diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index b0a2427732a6a..d00dfc87a0584 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -69,6 +69,10 @@ jobs: env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" + - name: "Copy-on-Write (warnings)" + env_file: actions-311.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "warn" - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 888cfc24aea5b..be8249cd3a287 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -456,6 +456,12 @@ be located. - tests.io + .. note:: + + This includes ``to_string`` but excludes ``__repr__``, which is + tested in ``tests.frame.test_repr`` and ``tests.series.test_repr``. + Other classes often have a ``test_formats`` file. + C) Otherwise This test likely belongs in one of: diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 2b3e4707b8e0b..38416afc1c94c 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -1,6 +1,6 @@ .. _whatsnew_212: -What's new in 2.1.2 (October 25, 2023) +What's new in 2.1.2 (October 26, 2023) --------------------------------------- These are the changes in pandas 2.1.2. See :ref:`release` for a full changelog diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ff19ef4eae179..d9909b0dbfad8 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -323,12 +323,14 @@ Datetimelike ^^^^^^^^^^^^ - Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`) - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) +- Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) - Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`) - Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`) - Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`) - Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) - Timedelta @@ -349,6 +351,7 @@ Numeric Conversion ^^^^^^^^^^ +- Bug in :func:`astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`) - Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`) - @@ -390,6 +393,7 @@ I/O - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) - Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`) +- Bug in :meth:`pandas.read_excel` with ``engine="odf"`` (``ods`` files) when string contains annotation (:issue:`55200`) - Bug in :meth:`pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`) - Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`) - @@ -441,9 +445,9 @@ Other - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) - Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) +- Bug in :meth:`Dataframe.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) - Bug in rendering ``inf`` values inside a a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) -- .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index daeb135f5bcf7..97784c924dab4 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -15,6 +15,7 @@ "option_context", "options", "using_copy_on_write", + "warn_copy_on_write", ] from pandas._config import config from pandas._config import dates # pyright: ignore[reportUnusedImport] # noqa: F401 @@ -32,7 +33,18 @@ def using_copy_on_write() -> bool: _mode_options = _global_config["mode"] - return _mode_options["copy_on_write"] and _mode_options["data_manager"] == "block" + return ( + _mode_options["copy_on_write"] is True + and _mode_options["data_manager"] == "block" + ) + + +def warn_copy_on_write() -> bool: + _mode_options = _global_config["mode"] + return ( + _mode_options["copy_on_write"] == "warn" + and _mode_options["data_manager"] == "block" + ) def using_nullable_dtypes() -> bool: diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 9eed70a23c9dd..b330b37aebd8d 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -814,7 +814,7 @@ def is_monotonic(ndarray[numeric_object_t, ndim=1] arr, bint timelike): return True, True, True if timelike and arr[0] == NPY_NAT: - return False, False, True + return False, False, False if numeric_object_t is not object: with nogil: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c0f21d1a7044c..7e17851de8ecc 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -792,7 +792,8 @@ cpdef ndarray[object] ensure_string_array( result = np.asarray(arr, dtype="object") - if copy and result is arr: + if copy and (result is arr or np.shares_memory(arr, result)): + # GH#54654 result = result.copy() elif not copy and result is arr: already_copied = False diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 9819b5173db56..7f95bfc717633 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -23,6 +23,7 @@ def array_to_datetime( dayfirst: bool = ..., yearfirst: bool = ..., utc: bool = ..., + creso: int = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... # returned ndarray may be object dtype or datetime64[ns] diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8fdbd9affa58a..d2eeea78ee7e8 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -64,6 +64,7 @@ from pandas._libs.tslibs.conversion cimport ( get_datetime64_nanos, parse_pydatetime, ) +from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_NaT as NaT, @@ -277,6 +278,7 @@ def array_with_unit_to_datetime( result, tz = array_to_datetime( values.astype(object, copy=False), errors=errors, + creso=NPY_FR_ns, ) return result, tz @@ -408,6 +410,7 @@ cpdef array_to_datetime( bint dayfirst=False, bint yearfirst=False, bint utc=False, + NPY_DATETIMEUNIT creso=NPY_FR_ns, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -434,6 +437,7 @@ cpdef array_to_datetime( yearfirst parsing behavior when encountering datetime strings utc : bool, default False indicator whether the dates should be UTC + creso : NPY_DATETIMEUNIT, default NPY_FR_ns Returns ------- @@ -457,13 +461,14 @@ cpdef array_to_datetime( set out_tzoffset_vals = set() tzinfo tz_out = None cnp.flatiter it = cnp.PyArray_IterNew(values) - NPY_DATETIMEUNIT creso = NPY_FR_ns DatetimeParseState state = DatetimeParseState() + str reso_str # specify error conditions assert is_raise or is_ignore or is_coerce - result = np.empty((values).shape, dtype="M8[ns]") + reso_str = npy_unit_to_abbrev(creso) + result = np.empty((values).shape, dtype=f"M8[{reso_str}]") iresult = result.view("i8").ravel() for i in range(n): @@ -477,14 +482,14 @@ cpdef array_to_datetime( elif PyDateTime_Check(val): tz_out = state.process_datetime(val, tz_out, utc_convert) - iresult[i] = parse_pydatetime(val, &dts, utc_convert, creso=creso) + iresult[i] = parse_pydatetime(val, &dts, creso=creso) elif PyDate_Check(val): - iresult[i] = pydate_to_dt64(val, &dts) - check_dts_bounds(&dts) + iresult[i] = pydate_to_dt64(val, &dts, reso=creso) + check_dts_bounds(&dts, creso) elif is_datetime64_object(val): - iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) + iresult[i] = get_datetime64_nanos(val, creso) elif is_integer_object(val) or is_float_object(val): # these must be ns unit by-definition @@ -493,7 +498,7 @@ cpdef array_to_datetime( iresult[i] = NPY_NAT else: # we now need to parse this as if unit='ns' - iresult[i] = cast_from_unit(val, "ns") + iresult[i] = cast_from_unit(val, "ns", out_reso=creso) elif isinstance(val, str): # string @@ -501,7 +506,7 @@ cpdef array_to_datetime( # GH#32264 np.str_ object val = str(val) - if parse_today_now(val, &iresult[i], utc): + if parse_today_now(val, &iresult[i], utc, creso): # We can't _quite_ dispatch this to convert_str_to_tsobject # bc there isn't a nice way to pass "utc" continue @@ -509,7 +514,7 @@ cpdef array_to_datetime( _ts = convert_str_to_tsobject( val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst ) - _ts.ensure_reso(NPY_FR_ns, val) + _ts.ensure_reso(creso, val) iresult[i] = _ts.value @@ -519,10 +524,6 @@ cpdef array_to_datetime( # store the UTC offsets in seconds instead nsecs = tz.utcoffset(None).total_seconds() out_tzoffset_vals.add(nsecs) - # need to set seen_datetime_offset *after* the - # potentially-raising timezone(timedelta(...)) call, - # otherwise we can go down the is_same_offsets path - # bc len(out_tzoffset_vals) == 0 seen_datetime_offset = True else: # Add a marker for naive string, to track if we are diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 36c57a8b9ce24..ec743dbf98f6b 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -43,7 +43,9 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1 cpdef datetime localize_pydatetime(datetime dt, tzinfo tz) cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1 -cpdef (int64_t, int) precision_from_unit(str unit, NPY_DATETIMEUNIT out_reso=*) +cpdef (int64_t, int) precision_from_unit( + NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=* +) cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso) @@ -51,6 +53,5 @@ cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso) cdef int64_t parse_pydatetime( datetime val, npy_datetimestruct *dts, - bint utc_convert, NPY_DATETIMEUNIT creso, ) except? -1 diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi index d564d767f7f05..be75c2f2f68ea 100644 --- a/pandas/_libs/tslibs/conversion.pyi +++ b/pandas/_libs/tslibs/conversion.pyi @@ -9,6 +9,6 @@ DT64NS_DTYPE: np.dtype TD64NS_DTYPE: np.dtype def precision_from_unit( - unit: str, + in_reso: int, # NPY_DATETIMEUNIT ) -> tuple[int, int]: ... # (int64_t, _) def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index ffc17e5d23258..dc1cc906c9d07 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -106,6 +106,7 @@ cdef int64_t cast_from_unit( cdef: int64_t m int p + NPY_DATETIMEUNIT in_reso if unit in ["Y", "M"]: if is_float_object(ts) and not ts.is_integer(): @@ -123,7 +124,14 @@ cdef int64_t cast_from_unit( dt64obj = np.datetime64(ts, unit) return get_datetime64_nanos(dt64obj, out_reso) - m, p = precision_from_unit(unit, out_reso) + in_reso = abbrev_to_npy_unit(unit) + if out_reso < in_reso and in_reso != NPY_DATETIMEUNIT.NPY_FR_GENERIC: + # We will end up rounding (always *down*), so don't need the fractional + # part of `ts`. + m, _ = precision_from_unit(out_reso, in_reso) + return (ts) // m + + m, p = precision_from_unit(in_reso, out_reso) # cast the unit, multiply base/frac separately # to avoid precision issues from float -> int @@ -146,8 +154,8 @@ cdef int64_t cast_from_unit( ) from err -cpdef inline (int64_t, int) precision_from_unit( - str unit, +cpdef (int64_t, int) precision_from_unit( + NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns, ): """ @@ -163,17 +171,16 @@ cpdef inline (int64_t, int) precision_from_unit( int64_t m int64_t multiplier int p - NPY_DATETIMEUNIT reso = abbrev_to_npy_unit(unit) - if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: - reso = NPY_DATETIMEUNIT.NPY_FR_ns - if reso == NPY_DATETIMEUNIT.NPY_FR_Y: + if in_reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + in_reso = NPY_DATETIMEUNIT.NPY_FR_ns + if in_reso == NPY_DATETIMEUNIT.NPY_FR_Y: # each 400 years we have 97 leap years, for an average of 97/400=.2425 # extra days each year. We get 31556952 by writing # 3600*24*365.2425=31556952 multiplier = periods_per_second(out_reso) m = multiplier * 31556952 - elif reso == NPY_DATETIMEUNIT.NPY_FR_M: + elif in_reso == NPY_DATETIMEUNIT.NPY_FR_M: # 2629746 comes from dividing the "Y" case by 12. multiplier = periods_per_second(out_reso) m = multiplier * 2629746 @@ -181,7 +188,7 @@ cpdef inline (int64_t, int) precision_from_unit( # Careful: if get_conversion_factor raises, the exception does # not propagate, instead we get a warning about an ignored exception. # https://github.com/pandas-dev/pandas/pull/51483#discussion_r1115198951 - m = get_conversion_factor(reso, out_reso) + m = get_conversion_factor(in_reso, out_reso) p = log10(m) # number of digits in 'm' minus 1 return m, p @@ -409,44 +416,22 @@ cdef _TSObject convert_datetime_to_tsobject( return obj -cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts, - int tzoffset, tzinfo tz=None, - NPY_DATETIMEUNIT reso=NPY_FR_ns): +cdef _adjust_tsobject_tz_using_offset(_TSObject obj, tzinfo tz): """ - Convert a datetimestruct `dts`, along with initial timezone offset - `tzoffset` to a _TSObject (with timezone object `tz` - optional). + Convert a datetimestruct `obj.dts`, with an attached tzinfo to a new + user-provided tz. Parameters ---------- - dts : npy_datetimestruct - tzoffset : int - tz : tzinfo or None - timezone for the timezone-aware output. - reso : NPY_DATETIMEUNIT, default NPY_FR_ns - - Returns - ------- obj : _TSObject + tz : tzinfo + timezone for the timezone-aware output. """ cdef: - _TSObject obj = _TSObject() - int64_t value # numpy dt64 datetime dt Py_ssize_t pos - - value = npy_datetimestruct_to_datetime(reso, &dts) - obj.dts = dts - obj.tzinfo = timezone(timedelta(minutes=tzoffset)) - obj.value = tz_localize_to_utc_single( - value, obj.tzinfo, ambiguous=None, nonexistent=None, creso=reso - ) - obj.creso = reso - if tz is None: - check_overflows(obj, reso) - return obj - - cdef: - Localizer info = Localizer(tz, reso) + int64_t ps = obj.dts.ps + Localizer info = Localizer(tz, obj.creso) # Infer fold from offset-adjusted obj.value # see PEP 495 https://www.python.org/dev/peps/pep-0495/#the-fold-attribute @@ -462,10 +447,15 @@ cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts, dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, obj.dts.hour, obj.dts.min, obj.dts.sec, obj.dts.us, obj.tzinfo, fold=obj.fold) - obj = convert_datetime_to_tsobject( - dt, tz, nanos=obj.dts.ps // 1000) - obj.ensure_reso(reso) # TODO: more performant to get reso right up front? - return obj + + # The rest here is similar to the 2-tz path in convert_datetime_to_tsobject + # but avoids re-calculating obj.value + dt = dt.astimezone(tz) + pydatetime_to_dtstruct(dt, &obj.dts) + obj.tzinfo = dt.tzinfo + obj.dts.ps = ps + check_dts_bounds(&obj.dts, obj.creso) + check_overflows(obj, obj.creso) cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, @@ -502,6 +492,7 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, datetime dt int64_t ival NPY_DATETIMEUNIT out_bestunit, reso + _TSObject obj if len(ts) == 0 or ts in nat_strings: obj = _TSObject() @@ -525,21 +516,28 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, if not string_to_dts_failed: reso = get_supported_reso(out_bestunit) check_dts_bounds(&dts, reso) + obj = _TSObject() + obj.dts = dts + obj.creso = reso + ival = npy_datetimestruct_to_datetime(reso, &dts) + if out_local == 1: - return _create_tsobject_tz_using_offset( - dts, out_tzoffset, tz, reso + obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) + obj.value = tz_localize_to_utc_single( + ival, obj.tzinfo, ambiguous="raise", nonexistent=None, creso=reso ) + if tz is None: + check_overflows(obj, reso) + return obj + _adjust_tsobject_tz_using_offset(obj, tz) + return obj else: - ival = npy_datetimestruct_to_datetime(reso, &dts) if tz is not None: # shift for _localize_tso ival = tz_localize_to_utc_single( ival, tz, ambiguous="raise", nonexistent=None, creso=reso ) - obj = _TSObject() - obj.dts = dts obj.value = ival - obj.creso = reso maybe_localize_tso(obj, tz, obj.creso) return obj @@ -675,7 +673,6 @@ cpdef inline datetime localize_pydatetime(datetime dt, tzinfo tz): cdef int64_t parse_pydatetime( datetime val, npy_datetimestruct *dts, - bint utc_convert, NPY_DATETIMEUNIT creso, ) except? -1: """ @@ -687,8 +684,6 @@ cdef int64_t parse_pydatetime( Element being processed. dts : *npy_datetimestruct Needed to use in pydatetime_to_dt64, which writes to it. - utc_convert : bool - Whether to convert/localize to UTC. creso : NPY_DATETIMEUNIT Resolution to store the the result. @@ -701,12 +696,8 @@ cdef int64_t parse_pydatetime( int64_t result if val.tzinfo is not None: - if utc_convert: - _ts = convert_datetime_to_tsobject(val, None, nanos=0, reso=creso) - result = _ts.value - else: - _ts = convert_datetime_to_tsobject(val, None, nanos=0, reso=creso) - result = _ts.value + _ts = convert_datetime_to_tsobject(val, None, nanos=0, reso=creso) + result = _ts.value else: if isinstance(val, _Timestamp): result = (<_Timestamp>val)._as_creso(creso, round_ok=False)._value diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd index d09612f4fbf7d..32a8edc9ee4a3 100644 --- a/pandas/_libs/tslibs/strptime.pxd +++ b/pandas/_libs/tslibs/strptime.pxd @@ -4,8 +4,10 @@ from cpython.datetime cimport ( ) from numpy cimport int64_t +from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT -cdef bint parse_today_now(str val, int64_t* iresult, bint utc) + +cdef bint parse_today_now(str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso) cdef class DatetimeParseState: diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d1c6bab180576..69466511a67fd 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -80,6 +80,7 @@ from pandas._libs.tslibs.timestamps import Timestamp cnp.import_array() + cdef bint format_is_iso(f: str): """ Does format match the iso8601 set that can be handled by the C parser? @@ -110,22 +111,27 @@ def _test_format_is_iso(f: str) -> bool: return format_is_iso(f) -cdef bint parse_today_now(str val, int64_t* iresult, bint utc): +cdef bint parse_today_now( + str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso +): # We delay this check for as long as possible # because it catches relatively rare cases + cdef: + _Timestamp ts - # Multiply by 1000 to convert to nanos, since these methods naturally have - # microsecond resolution if val == "now": if utc: - iresult[0] = Timestamp.utcnow()._value * 1000 + ts = <_Timestamp>Timestamp.utcnow() + iresult[0] = ts._as_creso(creso)._value else: # GH#18705 make sure to_datetime("now") matches Timestamp("now") # Note using Timestamp.now() is faster than Timestamp("now") - iresult[0] = Timestamp.now()._value * 1000 + ts = <_Timestamp>Timestamp.now() + iresult[0] = ts._as_creso(creso)._value return True elif val == "today": - iresult[0] = Timestamp.today()._value * 1000 + ts = <_Timestamp>Timestamp.today() + iresult[0] = ts._as_creso(creso)._value return True return False @@ -154,6 +160,77 @@ cdef dict _parse_code_table = {"y": 0, "u": 22} +cdef _validate_fmt(str fmt): + if "%W" in fmt or "%U" in fmt: + if "%Y" not in fmt and "%y" not in fmt: + raise ValueError("Cannot use '%W' or '%U' without day and year") + if "%A" not in fmt and "%a" not in fmt and "%w" not in fmt: + raise ValueError("Cannot use '%W' or '%U' without day and year") + elif "%Z" in fmt and "%z" in fmt: + raise ValueError("Cannot parse both %Z and %z") + elif "%j" in fmt and "%G" in fmt: + raise ValueError("Day of the year directive '%j' is not " + "compatible with ISO year directive '%G'. " + "Use '%Y' instead.") + elif "%G" in fmt and ( + "%V" not in fmt + or not ( + "%A" in fmt + or "%a" in fmt + or "%w" in fmt + or "%u" in fmt + ) + ): + raise ValueError("ISO year directive '%G' must be used with " + "the ISO week directive '%V' and a weekday " + "directive '%A', '%a', '%w', or '%u'.") + elif "%V" in fmt and "%Y" in fmt: + raise ValueError("ISO week directive '%V' is incompatible with " + "the year directive '%Y'. Use the ISO year " + "'%G' instead.") + elif "%V" in fmt and ( + "%G" not in fmt + or not ( + "%A" in fmt + or "%a" in fmt + or "%w" in fmt + or "%u" in fmt + ) + ): + raise ValueError("ISO week directive '%V' must be used with " + "the ISO year directive '%G' and a weekday " + "directive '%A', '%a', '%w', or '%u'.") + + +cdef _get_format_regex(str fmt): + global _TimeRE_cache, _regex_cache + with _cache_lock: + if _getlang() != _TimeRE_cache.locale_time.lang: + _TimeRE_cache = TimeRE() + _regex_cache.clear() + if len(_regex_cache) > _CACHE_MAX_SIZE: + _regex_cache.clear() + locale_time = _TimeRE_cache.locale_time + format_regex = _regex_cache.get(fmt) + if not format_regex: + try: + format_regex = _TimeRE_cache.compile(fmt) + except KeyError, err: + # KeyError raised when a bad format is found; can be specified as + # \\, in which case it was a stray % but with a space after it + bad_directive = err.args[0] + if bad_directive == "\\": + bad_directive = "%" + del err + raise ValueError(f"'{bad_directive}' is a bad directive " + f"in format '{fmt}'") + except IndexError: + # IndexError only occurs when the format string is "%" + raise ValueError(f"stray % in format '{fmt}'") + _regex_cache[fmt] = format_regex + return format_regex, locale_time + + cdef class DatetimeParseState: def __cinit__(self): self.found_tz = False @@ -221,71 +298,8 @@ def array_strptime( assert is_raise or is_ignore or is_coerce - if "%W" in fmt or "%U" in fmt: - if "%Y" not in fmt and "%y" not in fmt: - raise ValueError("Cannot use '%W' or '%U' without day and year") - if "%A" not in fmt and "%a" not in fmt and "%w" not in fmt: - raise ValueError("Cannot use '%W' or '%U' without day and year") - elif "%Z" in fmt and "%z" in fmt: - raise ValueError("Cannot parse both %Z and %z") - elif "%j" in fmt and "%G" in fmt: - raise ValueError("Day of the year directive '%j' is not " - "compatible with ISO year directive '%G'. " - "Use '%Y' instead.") - elif "%G" in fmt and ( - "%V" not in fmt - or not ( - "%A" in fmt - or "%a" in fmt - or "%w" in fmt - or "%u" in fmt - ) - ): - raise ValueError("ISO year directive '%G' must be used with " - "the ISO week directive '%V' and a weekday " - "directive '%A', '%a', '%w', or '%u'.") - elif "%V" in fmt and "%Y" in fmt: - raise ValueError("ISO week directive '%V' is incompatible with " - "the year directive '%Y'. Use the ISO year " - "'%G' instead.") - elif "%V" in fmt and ( - "%G" not in fmt - or not ( - "%A" in fmt - or "%a" in fmt - or "%w" in fmt - or "%u" in fmt - ) - ): - raise ValueError("ISO week directive '%V' must be used with " - "the ISO year directive '%G' and a weekday " - "directive '%A', '%a', '%w', or '%u'.") - - global _TimeRE_cache, _regex_cache - with _cache_lock: - if _getlang() != _TimeRE_cache.locale_time.lang: - _TimeRE_cache = TimeRE() - _regex_cache.clear() - if len(_regex_cache) > _CACHE_MAX_SIZE: - _regex_cache.clear() - locale_time = _TimeRE_cache.locale_time - format_regex = _regex_cache.get(fmt) - if not format_regex: - try: - format_regex = _TimeRE_cache.compile(fmt) - # KeyError raised when a bad format is found; can be specified as - # \\, in which case it was a stray % but with a space after it - except KeyError, err: - bad_directive = err.args[0] - if bad_directive == "\\": - bad_directive = "%" - del err - raise ValueError(f"'{bad_directive}' is a bad directive " - f"in format '{fmt}'") - # IndexError only occurs when the format string is "%" - except IndexError: - raise ValueError(f"stray % in format '{fmt}'") - _regex_cache[fmt] = format_regex + _validate_fmt(fmt) + format_regex, locale_time = _get_format_regex(fmt) result = np.empty(n, dtype="M8[ns]") iresult = result.view("i8") @@ -354,7 +368,7 @@ def array_strptime( check_dts_bounds(&dts) continue - if parse_today_now(val, &iresult[i], utc): + if parse_today_now(val, &iresult[i], utc, NPY_FR_ns): continue # Some ISO formats can't be parsed by string_to_dts @@ -366,8 +380,10 @@ def array_strptime( raise ValueError(f"Time data {val} is not ISO8601 format") tz = _parse_with_format( - val, fmt, exact, format_regex, locale_time, &iresult[i] + val, fmt, exact, format_regex, locale_time, &dts ) + iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) + check_dts_bounds(&dts) result_timezone[i] = tz except (ValueError, OutOfBoundsDatetime) as ex: @@ -391,10 +407,10 @@ def array_strptime( cdef tzinfo _parse_with_format( - str val, str fmt, bint exact, format_regex, locale_time, int64_t* iresult + str val, str fmt, bint exact, format_regex, locale_time, npy_datetimestruct* dts ): + # Based on https://github.com/python/cpython/blob/main/Lib/_strptime.py#L293 cdef: - npy_datetimestruct dts int year, month, day, minute, hour, second, weekday, julian int week_of_year, week_of_year_start, parse_code, ordinal int iso_week, iso_year @@ -452,24 +468,32 @@ cdef tzinfo _parse_with_format( # value in the range of [00, 68] is in the century 2000, while # [69,99] is in the century 1900 if year <= 68: + # e.g. val='May 04'; fmt='%b %y' year += 2000 else: year += 1900 + # TODO: not reached in tests 2023-10-28 elif parse_code == 1: + # e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S' year = int(found_dict["Y"]) elif parse_code == 2: + # e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S' month = int(found_dict["m"]) # elif group_key == 'B': elif parse_code == 3: + # e.g. val='30/December/2011'; fmt='%d/%B/%Y' month = locale_time.f_month.index(found_dict["B"].lower()) # elif group_key == 'b': elif parse_code == 4: + # e.g. val='30/Dec/2011 00:00:00'; fmt='%d/%b/%Y %H:%M:%S' month = locale_time.a_month.index(found_dict["b"].lower()) # elif group_key == 'd': elif parse_code == 5: + # e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S' day = int(found_dict["d"]) # elif group_key == 'H': elif parse_code == 6: + # e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S' hour = int(found_dict["H"]) elif parse_code == 7: hour = int(found_dict["I"]) @@ -481,17 +505,27 @@ cdef tzinfo _parse_with_format( # 12 midnight == 12 AM == hour 0 if hour == 12: hour = 0 + # TODO: not reached in tests 2023-10-28; the implicit `else` + # branch is tested with e.g. + # val='Tuesday 24 Aug 2021 01:30:48 AM' + # fmt='%A %d %b %Y %I:%M:%S %p' elif ampm == locale_time.am_pm[1]: # We're in PM so we need to add 12 to the hour unless # we're looking at 12 noon. # 12 noon == 12 PM == hour 12 if hour != 12: + # e.g. val='01/10/2010 08:14 PM'; fmt='%m/%d/%Y %I:%M %p' hour += 12 + # TODO: the implicit `else` branch is not tested 2023-10-28 + # TODO: the implicit `else` branch is not reached 2023-10-28; possible? elif parse_code == 8: + # e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S' minute = int(found_dict["M"]) elif parse_code == 9: + # e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S' second = int(found_dict["S"]) elif parse_code == 10: + # e.g. val='10:10:10.100'; fmt='%H:%M:%S.%f' s = found_dict["f"] # Pad to always return nanoseconds s += "0" * (9 - len(s)) @@ -499,34 +533,46 @@ cdef tzinfo _parse_with_format( ns = us % 1000 us = us // 1000 elif parse_code == 11: + # e.g val='Tuesday 24 Aug 2021 01:30:48 AM'; fmt='%A %d %b %Y %I:%M:%S %p' weekday = locale_time.f_weekday.index(found_dict["A"].lower()) elif parse_code == 12: + # e.g. val='Tue 24 Aug 2021 01:30:48 AM'; fmt='%a %d %b %Y %I:%M:%S %p' weekday = locale_time.a_weekday.index(found_dict["a"].lower()) elif parse_code == 13: weekday = int(found_dict["w"]) if weekday == 0: + # e.g. val='2013020'; fmt='%Y%U%w' weekday = 6 else: + # e.g. val='2009324'; fmt='%Y%W%w' weekday -= 1 elif parse_code == 14: + # e.g. val='2009164202000'; fmt='%Y%j%H%M%S' julian = int(found_dict["j"]) elif parse_code == 15 or parse_code == 16: week_of_year = int(found_dict[group_key]) if group_key == "U": + # e.g. val='2013020'; fmt='%Y%U%w' # U starts week on Sunday. week_of_year_start = 6 else: + # e.g. val='2009324'; fmt='%Y%W%w' # W starts week on Monday. week_of_year_start = 0 elif parse_code == 17: + # e.g. val='2011-12-30T00:00:00.000000UTC'; fmt='%Y-%m-%dT%H:%M:%S.%f%Z' tz = pytz.timezone(found_dict["Z"]) elif parse_code == 19: + # e.g. val='March 1, 2018 12:00:00+0400'; fmt='%B %d, %Y %H:%M:%S%z' tz = parse_timezone_directive(found_dict["z"]) elif parse_code == 20: + # e.g. val='2015-1-7'; fmt='%G-%V-%u' iso_year = int(found_dict["G"]) elif parse_code == 21: + # e.g. val='2015-1-7'; fmt='%G-%V-%u' iso_week = int(found_dict["V"]) elif parse_code == 22: + # e.g. val='2015-1-7'; fmt='%G-%V-%u' weekday = int(found_dict["u"]) weekday -= 1 @@ -534,18 +580,26 @@ cdef tzinfo _parse_with_format( # out the Julian day of the year. if julian == -1 and weekday != -1: if week_of_year != -1: + # e.g. val='2013020'; fmt='%Y%U%w' week_starts_Mon = week_of_year_start == 0 julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, week_starts_Mon) elif iso_year != -1 and iso_week != -1: + # e.g. val='2015-1-7'; fmt='%G-%V-%u' year, julian = _calc_julian_from_V(iso_year, iso_week, weekday + 1) + # else: + # # e.g. val='Thu Sep 25 2003'; fmt='%a %b %d %Y' + # pass + # Cannot pre-calculate date() since can change in Julian # calculation and thus could have different value for the day of the wk # calculation. if julian == -1: # Need to add 1 to result since first day of the year is 1, not # 0. + # We don't actually need ordinal/julian here, but need to raise + # on e.g. val='2015-04-31'; fmt='%Y-%m-%d' ordinal = date(year, month, day).toordinal() julian = ordinal - date(year, 1, 1).toordinal() + 1 else: @@ -557,6 +611,9 @@ cdef tzinfo _parse_with_format( month = datetime_result.month day = datetime_result.day if weekday == -1: + # We don't actually use weekday here, but need to do this in order to + # raise on y/m/d combinations + # TODO: not reached in tests 2023-10-28; necessary? weekday = date(year, month, day).weekday() dts.year = year @@ -567,10 +624,6 @@ cdef tzinfo _parse_with_format( dts.sec = second dts.us = us dts.ps = ns * 1000 - - iresult[0] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) - check_dts_bounds(&dts) - return tz diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index a423137c68123..e67c0fd91cd6f 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -303,18 +303,16 @@ cdef object ensure_td64ns(object ts): cdef: NPY_DATETIMEUNIT td64_unit int64_t td64_value, mult - str unitstr td64_unit = get_datetime64_unit(ts) if ( td64_unit != NPY_DATETIMEUNIT.NPY_FR_ns and td64_unit != NPY_DATETIMEUNIT.NPY_FR_GENERIC ): - unitstr = npy_unit_to_abbrev(td64_unit) td64_value = cnp.get_timedelta64_value(ts) - mult = precision_from_unit(unitstr)[0] + mult = precision_from_unit(td64_unit)[0] try: # NB: cython#1381 this cannot be *= td64_value = td64_value * mult diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 160e2964896ba..81cd504119c38 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -88,6 +88,7 @@ get_obj, ) from pandas._testing.contexts import ( + assert_cow_warning, decompress_file, ensure_clean, raises_chained_assignment_error, @@ -1097,6 +1098,7 @@ def shares_memory(left, right) -> bool: "assert_series_equal", "assert_sp_array_equal", "assert_timedelta_array_equal", + "assert_cow_warning", "at", "BOOL_DTYPES", "box_expected", diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index b2bb8e71fdf5c..6edbd047699ed 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -214,3 +214,29 @@ def raises_chained_assignment_error(extra_warnings=(), extra_match=()): (ChainedAssignmentError, *extra_warnings), match="|".join((match, *extra_match)), ) + + +def assert_cow_warning(warn=True, match=None, **kwargs): + """ + Assert that a warning is raised in the CoW warning mode. + + Parameters + ---------- + warn : bool, default True + By default, check that a warning is raised. Can be turned off by passing False. + match : str + The warning message to match against, if different from the default. + kwargs + Passed through to assert_produces_warning + """ + from pandas._testing import assert_produces_warning + + if not warn: + from contextlib import nullcontext + + return nullcontext() + + if not match: + match = "Setting a value on a view" + + return assert_produces_warning(FutureWarning, match=match, **kwargs) diff --git a/pandas/conftest.py b/pandas/conftest.py index 7f24b8370c8eb..b62a4391ca654 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1995,7 +1995,18 @@ def using_copy_on_write() -> bool: Fixture to check if Copy-on-Write is enabled. """ return ( - pd.options.mode.copy_on_write + pd.options.mode.copy_on_write is True + and _get_option("mode.data_manager", silent=True) == "block" + ) + + +@pytest.fixture +def warn_copy_on_write() -> bool: + """ + Fixture to check if Copy-on-Write is enabled. + """ + return ( + pd.options.mode.copy_on_write == "warn" and _get_option("mode.data_manager", silent=True) == "block" ) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index eb7a38df3fee9..3fff5cb2aa0c7 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2251,6 +2251,7 @@ def _sequence_to_dt64ns( dayfirst=dayfirst, yearfirst=yearfirst, allow_object=False, + out_unit=out_unit or "ns", ) copy = False if tz and inferred_tz: @@ -2258,11 +2259,11 @@ def _sequence_to_dt64ns( assert converted.dtype == "i8" # GH#42505 # by convention, these are _already_ UTC, e.g - result = converted.view(DT64NS_DTYPE) + result = converted.view(out_dtype) elif inferred_tz: tz = inferred_tz - result = converted.view(DT64NS_DTYPE) + result = converted.view(out_dtype) else: result, _ = _construct_from_dt64_naive( @@ -2360,6 +2361,7 @@ def objects_to_datetime64ns( utc: bool = False, errors: DateTimeErrorChoices = "raise", allow_object: bool = False, + out_unit: str = "ns", ): """ Convert data to array of timestamps. @@ -2375,6 +2377,7 @@ def objects_to_datetime64ns( allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. + out_unit : str, default "ns" Returns ------- @@ -2399,6 +2402,7 @@ def objects_to_datetime64ns( utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, + creso=abbrev_to_npy_unit(out_unit), ) if tz_parsed is not None: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 128f1c73062c0..079ac3562c3d9 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -29,6 +29,7 @@ to_offset, ) from pandas._libs.tslibs.conversion import precision_from_unit +from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit from pandas._libs.tslibs.fields import ( get_timedelta_days, get_timedelta_field, @@ -1078,7 +1079,7 @@ def sequence_to_td64ns( else: mask = np.isnan(data) # The next few lines are effectively a vectorized 'cast_from_unit' - m, p = precision_from_unit(unit or "ns") + m, p = precision_from_unit(abbrev_to_npy_unit(unit or "ns")) with warnings.catch_warnings(): # Suppress RuntimeWarning about All-NaN slice warnings.filterwarnings( diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 4652acdcae287..a8b63f97141c2 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -476,9 +476,11 @@ def use_inf_as_na_cb(key) -> None: "copy_on_write", # Get the default from an environment variable, if set, otherwise defaults # to False. This environment variable can be set for testing. - os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1", + "warn" + if os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "warn" + else os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1", copy_on_write_doc, - validator=is_bool, + validator=is_one_of_factory([True, False, "warn"]), ) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index ac3a44276ac6d..f5579082c679b 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -105,11 +105,10 @@ def _astype_nansafe( # then coerce to datetime64[ns] and use DatetimeArray.astype if lib.is_np_dtype(dtype, "M"): - from pandas import to_datetime + from pandas.core.arrays import DatetimeArray - dti = to_datetime(arr.ravel()) - dta = dti._data.reshape(arr.shape) - return dta.astype(dtype, copy=False)._ndarray + dta = DatetimeArray._from_sequence(arr, dtype=dtype) + return dta._ndarray elif lib.is_np_dtype(dtype, "m"): from pandas.core.construction import ensure_wrapped_if_datetimelike diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index b489c14ac0c42..9ec662a6cd352 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -129,7 +129,16 @@ def concat_compat( # i.e. isinstance(to_concat[0], ExtensionArray) to_concat_eas = cast("Sequence[ExtensionArray]", to_concat) cls = type(to_concat[0]) - return cls._concat_same_type(to_concat_eas) + # GH#53640: eg. for datetime array, axis=1 but 0 is default + # However, class method `_concat_same_type()` for some classes + # may not support the `axis` keyword + if ea_compat_axis or axis == 0: + return cls._concat_same_type(to_concat_eas) + else: + return cls._concat_same_type( + to_concat_eas, + axis=axis, # type: ignore[call-arg] + ) else: to_concat_arrs = cast("Sequence[np.ndarray]", to_concat) result = np.concatenate(to_concat_arrs, axis=axis) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c49c9eddae62c..7c2247101ed86 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -30,6 +30,7 @@ from pandas._config import ( config, using_copy_on_write, + warn_copy_on_write, ) from pandas._libs import lib @@ -4397,7 +4398,7 @@ def _check_setitem_copy(self, t: str = "setting", force: bool_t = False): df.iloc[0:5]['group'] = 'a' """ - if using_copy_on_write(): + if using_copy_on_write() or warn_copy_on_write(): return # return early if the check is not needed @@ -12391,6 +12392,12 @@ def _inplace_method(self, other, op) -> Self: """ Wrap arithmetic method to operate inplace. """ + warn = True + if not PYPY and warn_copy_on_write(): + if sys.getrefcount(self) <= 5: + # we are probably in an inplace setitem context (e.g. df['a'] += 1) + warn = False + result = op(self, other) if self.ndim == 1 and result._indexed_same(self) and result.dtype == self.dtype: @@ -12398,7 +12405,7 @@ def _inplace_method(self, other, op) -> Self: # Item "ArrayManager" of "Union[ArrayManager, SingleArrayManager, # BlockManager, SingleBlockManager]" has no attribute "setitem_inplace" self._mgr.setitem_inplace( # type: ignore[union-attr] - slice(None), result._values + slice(None), result._values, warn=warn ) return self diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 877b8edb32520..560285bd57a22 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -220,7 +220,10 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: if len(indexes) == 1: result = indexes[0] if isinstance(result, list): - result = Index(sorted(result)) + if not sort: + result = Index(result) + else: + result = Index(sorted(result)) return result indexes, kind = _sanitize_and_check(indexes) diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 677dd369fa4ee..fe366c7375c6a 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -307,7 +307,7 @@ def array(self) -> ArrayLike: # error: "SingleDataManager" has no attribute "arrays"; maybe "array" return self.arrays[0] # type: ignore[attr-defined] - def setitem_inplace(self, indexer, value) -> None: + def setitem_inplace(self, indexer, value, warn: bool = True) -> None: """ Set values with indexer. diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 9f6ac1306d8a6..b109ce25a3e73 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -16,7 +16,10 @@ import numpy as np -from pandas._config import using_copy_on_write +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) from pandas._libs import ( internals as libinternals, @@ -97,6 +100,20 @@ from pandas.api.extensions import ExtensionArray +COW_WARNING_SETITEM_MSG = """\ +Setting a value on a view: behaviour will change in pandas 3.0. +Currently, the mutation will also have effect on the object that shares data +with this object. For example, when setting a value in a Series that was +extracted from a column of a DataFrame, that DataFrame will also be updated: + + ser = df["col"] + ser[0] = 0 <--- in pandas 2, this also updates `df` + +In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never +modify another, and thus in the example above, `df` will not be changed. +""" + + class BaseBlockManager(DataManager): """ Core internal data structure to implement DataFrame, Series, etc. @@ -1988,7 +2005,7 @@ def get_numeric_data(self, copy: bool = False) -> Self: def _can_hold_na(self) -> bool: return self._block._can_hold_na - def setitem_inplace(self, indexer, value) -> None: + def setitem_inplace(self, indexer, value, warn: bool = True) -> None: """ Set values with indexer. @@ -1998,9 +2015,18 @@ def setitem_inplace(self, indexer, value) -> None: in place, not returning a new Manager (and Block), and thus never changing the dtype. """ - if using_copy_on_write() and not self._has_no_reference(0): - self.blocks = (self._block.copy(),) - self._cache.clear() + using_cow = using_copy_on_write() + warn_cow = warn_copy_on_write() + if (using_cow or warn_cow) and not self._has_no_reference(0): + if using_cow: + self.blocks = (self._block.copy(),) + self._cache.clear() + elif warn and warn_cow: + warnings.warn( + COW_WARNING_SETITEM_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) super().setitem_inplace(indexer, value) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 3d8b043a90bd1..95328d10c9d31 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -31,6 +31,7 @@ timezones as libtimezones, ) from pandas._libs.tslibs.conversion import precision_from_unit +from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -337,6 +338,8 @@ def _return_parsed_timezone_results( tz_results = np.empty(len(result), dtype=object) non_na_timezones = set() for zone in unique(timezones): + # GH#50168 looping over unique timezones is faster than + # operating pointwise. mask = timezones == zone dta = DatetimeArray(result[mask]).tz_localize(zone) if utc: @@ -548,7 +551,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: tz_parsed = None elif arg.dtype.kind == "f": - mult, _ = precision_from_unit(unit) + mult, _ = precision_from_unit(abbrev_to_npy_unit(unit)) mask = np.isnan(arg) | (arg == iNaT) fvalues = (arg * mult).astype("f8", copy=False) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 775f3cd428677..82b4746aa57a5 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -1068,7 +1068,7 @@ def mean(self, *args, update=None, update_times=None, **kwargs): result_kwargs["columns"] = self._selected_obj.columns else: result_kwargs["name"] = self._selected_obj.name - np_array = self._selected_obj.astype(np.float64).to_numpy() + np_array = self._selected_obj.astype(np.float64, copy=False).to_numpy() ewma_func = generate_online_numba_ewma_func( **get_jit_arguments(self.engine_kwargs) ) diff --git a/pandas/core/window/online.py b/pandas/core/window/online.py index f9e3122b304bc..29d1f740e021f 100644 --- a/pandas/core/window/online.py +++ b/pandas/core/window/online.py @@ -52,7 +52,7 @@ def online_ewma( exponentially weighted mean accounting minimum periods. """ result = np.empty(values.shape) - weighted_avg = values[0] + weighted_avg = values[0].copy() nobs = (~np.isnan(weighted_avg)).astype(np.int64) result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 6def0024d9073..d5dc5eac8422e 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -155,9 +155,11 @@ Returns a subset of the columns according to behavior above. dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}} - Use `object` to preserve data as stored in Excel and not interpret dtype. + Use `object` to preserve data as stored in Excel and not interpret dtype, + which will necessarily result in `object` dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. + If you use `None`, it will infer the dtype of each column based on the data. engine : str, default None If io is not a buffer or path, this must be set to identify io. Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine". diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 277f64f636731..69b514da32857 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -228,8 +228,10 @@ def _get_cell_string_value(self, cell) -> str: """ from odf.element import Element from odf.namespaces import TEXTNS + from odf.office import Annotation from odf.text import S + office_annotation = Annotation().qname text_s = S().qname value = [] @@ -239,6 +241,8 @@ def _get_cell_string_value(self, cell) -> str: if fragment.qname == text_s: spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) value.append(" " * spaces) + elif fragment.qname == office_annotation: + continue else: # recursive impl needed in case of nested fragments # with multiple spaces diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 91f7b2afec56c..3659e7247495d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -30,6 +30,7 @@ from pandas._config import ( config, get_option, + using_copy_on_write, using_pyarrow_string_dtype, ) @@ -3297,6 +3298,10 @@ def read( if len(dfs) > 0: out = concat(dfs, axis=1, copy=True) + if using_copy_on_write(): + # with CoW, concat ignores the copy keyword. Here, we still want + # to copy to enforce optimized column-major layout + out = out.copy() out = out.reindex(columns=items, copy=False) return out diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index a3d9de5e78afb..44829c598253d 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -147,6 +147,8 @@ def test_transform_axis_1_raises(): Series([1]).transform("sum", axis=1) +# TODO(CoW-warn) should not need to warn +@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_apply_modify_traceback(): data = DataFrame( { diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 7e1e44d2119f9..3924d8e74e156 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -9,7 +9,7 @@ ) import pandas._testing as tm -pytestmark = td.skip_if_no("numba") +pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu] def test_numba_vs_python_noop(float_frame, apply_axis): diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 3d5556bdd2823..d462ce3d3187d 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -130,6 +132,15 @@ def test_astype_string_and_object_update_original( tm.assert_frame_equal(df2, df_orig) +def test_astype_string_copy_on_pickle_roundrip(): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy.astype(str) + tm.assert_series_equal(base, base_copy) + + def test_astype_dict_dtypes(using_copy_on_write): df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")} diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index ebb25bd5c57d3..ad55f9d561fe0 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -139,7 +139,9 @@ def test_subset_row_slice(backend, using_copy_on_write): @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_subset_column_slice(backend, using_copy_on_write, using_array_manager, dtype): +def test_subset_column_slice( + backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype +): # Case: taking a subset of the columns of a DataFrame using a slice # + afterwards modifying the subset dtype_backend, DataFrame, _ = backend @@ -159,10 +161,14 @@ def test_subset_column_slice(backend, using_copy_on_write, using_array_manager, subset.iloc[0, 0] = 0 assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b")) - else: # we only get a warning in case of a single block - warn = SettingWithCopyWarning if single_block else None + # TODO(CoW-warn) should warn + warn = ( + SettingWithCopyWarning + if (single_block and not warn_copy_on_write) + else None + ) with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning(warn): subset.iloc[0, 0] = 0 @@ -303,7 +309,9 @@ def test_subset_iloc_rows_columns( [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], ids=["slice", "mask", "array"], ) -def test_subset_set_with_row_indexer(backend, indexer_si, indexer, using_copy_on_write): +def test_subset_set_with_row_indexer( + backend, indexer_si, indexer, using_copy_on_write, warn_copy_on_write +): # Case: setting values with a row indexer on a viewing subset # subset[indexer] = value and subset.iloc[indexer] = value _, DataFrame, _ = backend @@ -318,7 +326,8 @@ def test_subset_set_with_row_indexer(backend, indexer_si, indexer, using_copy_on ): pytest.skip("setitem with labels selects on columns") - if using_copy_on_write: + # TODO(CoW-warn) should warn + if using_copy_on_write or warn_copy_on_write: indexer_si(subset)[indexer] = 0 else: # INFO iloc no longer raises warning since pandas 1.4 @@ -340,7 +349,7 @@ def test_subset_set_with_row_indexer(backend, indexer_si, indexer, using_copy_on tm.assert_frame_equal(df, df_orig) -def test_subset_set_with_mask(backend, using_copy_on_write): +def test_subset_set_with_mask(backend, using_copy_on_write, warn_copy_on_write): # Case: setting values with a mask on a viewing subset: subset[mask] = value _, DataFrame, _ = backend df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) @@ -349,7 +358,8 @@ def test_subset_set_with_mask(backend, using_copy_on_write): mask = subset > 3 - if using_copy_on_write: + # TODO(CoW-warn) should warn + if using_copy_on_write or warn_copy_on_write: subset[mask] = 0 else: with pd.option_context("chained_assignment", "warn"): @@ -370,7 +380,7 @@ def test_subset_set_with_mask(backend, using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_subset_set_column(backend, using_copy_on_write): +def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write): # Case: setting a single column on a viewing subset -> subset[col] = value dtype_backend, DataFrame, _ = backend df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) @@ -382,7 +392,8 @@ def test_subset_set_column(backend, using_copy_on_write): else: arr = pd.array([10, 11], dtype="Int64") - if using_copy_on_write: + # TODO(CoW-warn) should warn + if using_copy_on_write or warn_copy_on_write: subset["a"] = arr else: with pd.option_context("chained_assignment", "warn"): @@ -472,7 +483,7 @@ def test_subset_set_column_with_loc2(backend, using_copy_on_write, using_array_m @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_subset_set_columns(backend, using_copy_on_write, dtype): +def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dtype): # Case: setting multiple columns on a viewing subset # -> subset[[col1, col2]] = value dtype_backend, DataFrame, _ = backend @@ -482,7 +493,8 @@ def test_subset_set_columns(backend, using_copy_on_write, dtype): df_orig = df.copy() subset = df[1:3] - if using_copy_on_write: + # TODO(CoW-warn) should warn + if using_copy_on_write or warn_copy_on_write: subset[["a", "c"]] = 0 else: with pd.option_context("chained_assignment", "warn"): @@ -879,7 +891,9 @@ def test_del_series(backend): # Accessing column as Series -def test_column_as_series(backend, using_copy_on_write, using_array_manager): +def test_column_as_series( + backend, using_copy_on_write, warn_copy_on_write, using_array_manager +): # Case: selecting a single column now also uses Copy-on-Write dtype_backend, DataFrame, Series = backend df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) @@ -892,10 +906,14 @@ def test_column_as_series(backend, using_copy_on_write, using_array_manager): if using_copy_on_write or using_array_manager: s[0] = 0 else: - warn = SettingWithCopyWarning if dtype_backend == "numpy" else None - with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(warn): + if warn_copy_on_write: + with tm.assert_cow_warning(): s[0] = 0 + else: + warn = SettingWithCopyWarning if dtype_backend == "numpy" else None + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(warn): + s[0] = 0 expected = Series([0, 2, 3], name="a") tm.assert_series_equal(s, expected) @@ -910,7 +928,7 @@ def test_column_as_series(backend, using_copy_on_write, using_array_manager): def test_column_as_series_set_with_upcast( - backend, using_copy_on_write, using_array_manager + backend, using_copy_on_write, using_array_manager, warn_copy_on_write ): # Case: selecting a single column now also uses Copy-on-Write -> when # setting a value causes an upcast, we don't need to update the parent @@ -921,10 +939,11 @@ def test_column_as_series_set_with_upcast( s = df["a"] if dtype_backend == "nullable": - with pytest.raises(TypeError, match="Invalid value"): - s[0] = "foo" + with tm.assert_cow_warning(warn_copy_on_write): + with pytest.raises(TypeError, match="Invalid value"): + s[0] = "foo" expected = Series([1, 2, 3], name="a") - elif using_copy_on_write or using_array_manager: + elif using_copy_on_write or warn_copy_on_write or using_array_manager: with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): s[0] = "foo" expected = Series(["foo", 2, 3], dtype=object, name="a") @@ -962,7 +981,12 @@ def test_column_as_series_set_with_upcast( ids=["getitem", "loc", "iloc"], ) def test_column_as_series_no_item_cache( - request, backend, method, using_copy_on_write, using_array_manager + request, + backend, + method, + using_copy_on_write, + warn_copy_on_write, + using_array_manager, ): # Case: selecting a single column (which now also uses Copy-on-Write to protect # the view) should always give a new object (i.e. not make use of a cache) @@ -979,7 +1003,8 @@ def test_column_as_series_no_item_cache( else: assert s1 is s2 - if using_copy_on_write or using_array_manager: + # TODO(CoW-warn) should warn + if using_copy_on_write or warn_copy_on_write or using_array_manager: s1.iloc[0] = 0 else: warn = SettingWithCopyWarning if dtype_backend == "numpy" else None diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 45bfc6a6fcf9b..60ab21f48e910 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1651,7 +1651,7 @@ def test_isetitem_frame(using_copy_on_write): @pytest.mark.parametrize("key", ["a", ["a"]]) -def test_get(using_copy_on_write, key): +def test_get(using_copy_on_write, warn_copy_on_write, key): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() @@ -1665,7 +1665,12 @@ def test_get(using_copy_on_write, key): else: # for non-CoW it depends on whether we got a Series or DataFrame if it # is a view or copy or triggers a warning or not - warn = SettingWithCopyWarning if isinstance(key, list) else None + # TODO(CoW) should warn + warn = ( + (None if warn_copy_on_write else SettingWithCopyWarning) + if isinstance(key, list) + else None + ) with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning(warn): result.iloc[0] = 0 @@ -1680,7 +1685,9 @@ def test_get(using_copy_on_write, key): @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_xs(using_copy_on_write, using_array_manager, axis, key, dtype): +def test_xs( + using_copy_on_write, warn_copy_on_write, using_array_manager, axis, key, dtype +): single_block = (dtype == "int64") and not using_array_manager is_view = single_block or (using_array_manager and axis == 1) df = DataFrame( @@ -1695,8 +1702,12 @@ def test_xs(using_copy_on_write, using_array_manager, axis, key, dtype): elif using_copy_on_write: assert result._mgr._has_no_reference(0) + # TODO(CoW) should warn in case of is_view if using_copy_on_write or is_view: result.iloc[0] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(single_block): + result.iloc[0] = 0 else: with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning(SettingWithCopyWarning): @@ -1710,7 +1721,9 @@ def test_xs(using_copy_on_write, using_array_manager, axis, key, dtype): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("key, level", [("l1", 0), (2, 1)]) -def test_xs_multiindex(using_copy_on_write, using_array_manager, key, level, axis): +def test_xs_multiindex( + using_copy_on_write, warn_copy_on_write, using_array_manager, key, level, axis +): arr = np.arange(18).reshape(6, 3) index = MultiIndex.from_product([["l1", "l2"], [1, 2, 3]], names=["lev1", "lev2"]) df = DataFrame(arr, index=index, columns=list("abc")) @@ -1725,8 +1738,9 @@ def test_xs_multiindex(using_copy_on_write, using_array_manager, key, level, axi get_array(df, df.columns[0]), get_array(result, result.columns[0]) ) + # TODO(CoW) should warn warn = ( - SettingWithCopyWarning + (None if warn_copy_on_write else SettingWithCopyWarning) if not using_copy_on_write and not using_array_manager else None ) diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 5016b57bdd0b7..bc3b939734534 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -140,3 +140,17 @@ def test_setitem_series_column_midx_broadcasting(using_copy_on_write): assert not np.shares_memory(get_array(rhs), df._get_column_array(0)) if using_copy_on_write: assert df._mgr._has_no_reference(0) + + +def test_set_column_with_inplace_operator(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + # this should not raise any warning + with tm.assert_produces_warning(None): + df["a"] += 1 + + # when it is not in a chain, then it should produce a warning + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + ser = df["a"] + with tm.assert_cow_warning(warn_copy_on_write): + ser += 1 diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index a94f7de283d01..30b10541c8f66 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -215,6 +215,6 @@ def using_copy_on_write() -> bool: Fixture to check if Copy-on-Write is enabled. """ return ( - options.mode.copy_on_write + options.mode.copy_on_write is True and _get_option("mode.data_manager", silent=True) == "block" ) diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index d78924ff9d046..845174bbf600e 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -200,3 +200,24 @@ def test_from_dict_orient_invalid(self): ) with pytest.raises(ValueError, match=msg): DataFrame.from_dict({"foo": 1, "baz": 3, "bar": 2}, orient="abc") + + def test_from_dict_order_with_single_column(self): + data = { + "alpha": { + "value2": 123, + "value1": 532, + "animal": 222, + "plant": False, + "name": "test", + } + } + result = DataFrame.from_dict( + data, + orient="columns", + ) + expected = DataFrame( + [[123], [532], [222], [False], ["test"]], + index=["value2", "value1", "animal", "plant", "name"], + columns=["alpha"], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index de8df15a9d747..ba8d713df5787 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -288,7 +288,7 @@ def test_setattr_column(self): df.foobar = 5 assert (df.foobar == 5).all() - def test_setitem(self, float_frame, using_copy_on_write): + def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write): # not sure what else to do here series = float_frame["A"][::2] float_frame["col5"] = series @@ -324,7 +324,7 @@ def test_setitem(self, float_frame, using_copy_on_write): smaller = float_frame[:2] msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: # With CoW, adding a new column doesn't raise a warning smaller["col10"] = ["1", "2"] else: @@ -1295,7 +1295,7 @@ def test_setting_mismatched_na_into_nullable_fails( ): # GH#44514 don't cast mismatched nulls to pd.NA df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype) - ser = df["A"] + ser = df["A"].copy() arr = ser._values msg = "|".join( diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 7db8f51afe291..1317e3767efba 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1282,7 +1282,9 @@ def test_setitem_not_operating_inplace(self, value, set_value, indexer): tm.assert_frame_equal(view, expected) @td.skip_array_manager_invalid_test - def test_setitem_column_update_inplace(self, using_copy_on_write): + def test_setitem_column_update_inplace( + self, using_copy_on_write, warn_copy_on_write + ): # https://github.com/pandas-dev/pandas/issues/47172 labels = [f"c{i}" for i in range(10)] @@ -1290,8 +1292,9 @@ def test_setitem_column_update_inplace(self, using_copy_on_write): values = df._mgr.blocks[0].values if not using_copy_on_write: - for label in df.columns: - df[label][label] = 1 + with tm.assert_cow_warning(warn_copy_on_write): + for label in df.columns: + df[label][label] = 1 # diagonal values all updated assert np.all(values[np.arange(10), np.arange(10)] == 1) diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index 492dd387971c8..772738ae460b9 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -198,14 +198,17 @@ def test_xs_level_eq_2(self): tm.assert_frame_equal(result, expected) def test_xs_setting_with_copy_error( - self, multiindex_dataframe_random_data, using_copy_on_write + self, + multiindex_dataframe_random_data, + using_copy_on_write, + warn_copy_on_write, ): # this is a copy in 0.14 df = multiindex_dataframe_random_data df_orig = df.copy() result = df.xs("two", level="second") - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: result[:] = 10 else: # setting this will give a SettingWithCopyError @@ -216,14 +219,14 @@ def test_xs_setting_with_copy_error( tm.assert_frame_equal(df, df_orig) def test_xs_setting_with_copy_error_multiple( - self, four_level_index_dataframe, using_copy_on_write + self, four_level_index_dataframe, using_copy_on_write, warn_copy_on_write ): # this is a copy in 0.14 df = four_level_index_dataframe df_orig = df.copy() result = df.xs(("a", 4), level=["one", "four"]) - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: result[:] = 10 else: # setting this will give a SettingWithCopyError diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index a6caafb48ca4d..eac10d307c61c 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -382,7 +382,12 @@ def test_astype_from_object_to_datetime_unit(self, unit): ["2017-01-01", "2017-01-02", "2017-02-03"], ] df = DataFrame(vals, dtype=object) - with pytest.raises(TypeError, match="Cannot cast"): + msg = ( + rf"Unexpected value for 'dtype': 'datetime64\[{unit}\]'. " + r"Must be 'datetime64\[s\]', 'datetime64\[ms\]', 'datetime64\[us\]', " + r"'datetime64\[ns\]' or DatetimeTZDtype" + ) + with pytest.raises(ValueError, match=msg): df.astype(f"M8[{unit}]") @pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"]) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/frame/methods/test_info.py similarity index 93% rename from pandas/tests/io/formats/test_info.py rename to pandas/tests/frame/methods/test_info.py index 6c3bf01cb1857..7d9e0fe90f44c 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -71,6 +71,7 @@ def test_info_categorical_column_smoke_test(): "float_frame", "datetime_frame", "duplicate_columns_frame", + "float_string_frame", ], ) def test_info_smoke_test(fixture_func_name, request): @@ -80,6 +81,19 @@ def test_info_smoke_test(fixture_func_name, request): result = buf.getvalue().splitlines() assert len(result) > 10 + buf = StringIO() + frame.info(buf=buf, verbose=False) + + +def test_info_smoke_test2(float_frame): + # pretty useless test, used to be mixed into the repr tests + buf = StringIO() + float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf) + float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf) + + # no columns or index + DataFrame().info(buf=buf) + @pytest.mark.parametrize( "num_columns, max_info_columns, verbose", @@ -525,3 +539,27 @@ def test_info_compute_numba(): df.info() expected = buf.getvalue() assert result == expected + + +@pytest.mark.parametrize( + "row, columns, show_counts, result", + [ + [20, 20, None, True], + [20, 20, True, True], + [20, 20, False, False], + [5, 5, None, False], + [5, 5, True, False], + [5, 5, False, False], + ], +) +def test_info_show_counts(row, columns, show_counts, result): + # Explicit cast to float to avoid implicit cast when setting nan + df = DataFrame(1, columns=range(10), index=range(10)).astype({1: "float"}) + df.iloc[1, 1] = np.nan + + with option_context( + "display.max_info_rows", row, "display.max_info_columns", columns + ): + with StringIO() as buf: + df.info(buf=buf, show_counts=show_counts) + assert ("non-null" in buf.getvalue()) is result diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index fb70e656814f9..f35cb21f378bb 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -50,13 +50,12 @@ def test_rename(self, float_frame): # index data = {"A": {"foo": 0, "bar": 1}} - # gets sorted alphabetical df = DataFrame(data) renamed = df.rename(index={"foo": "bar", "bar": "foo"}) - tm.assert_index_equal(renamed.index, Index(["foo", "bar"])) + tm.assert_index_equal(renamed.index, Index(["bar", "foo"])) renamed = df.rename(index=str.upper) - tm.assert_index_equal(renamed.index, Index(["BAR", "FOO"])) + tm.assert_index_equal(renamed.index, Index(["FOO", "BAR"])) # have to pass something with pytest.raises(TypeError, match="must pass an index to rename"): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index c59edbae05739..b0e4bc95b3fb8 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -22,15 +22,13 @@ ) import pandas._testing as tm from pandas.core.computation import expressions as expr -from pandas.core.computation.expressions import _MIN_ELEMENTS from pandas.tests.frame.common import ( _check_mixed_float, _check_mixed_int, ) -from pandas.util.version import Version -@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) +@pytest.fixture(autouse=True, params=[0, 100], ids=["numexpr", "python"]) def switch_numexpr_min_elements(request, monkeypatch): with monkeypatch.context() as m: m.setattr(expr, "_MIN_ELEMENTS", request.param) @@ -499,34 +497,6 @@ def test_floordiv_axis0(self): result2 = df.floordiv(ser.values, axis=0) tm.assert_frame_equal(result2, expected) - @pytest.mark.parametrize("opname", ["floordiv", "pow"]) - def test_floordiv_axis0_numexpr_path(self, opname, request): - # case that goes through numexpr and has to fall back to masked_arith_op - ne = pytest.importorskip("numexpr") - if ( - Version(ne.__version__) >= Version("2.8.7") - and opname == "pow" - and "python" in request.node.callspec.id - ): - request.applymarker( - pytest.mark.xfail(reason="https://github.com/pydata/numexpr/issues/454") - ) - - op = getattr(operator, opname) - - arr = np.arange(_MIN_ELEMENTS + 100).reshape(_MIN_ELEMENTS // 100 + 1, -1) * 100 - df = DataFrame(arr) - df["C"] = 1.0 - - ser = df[0] - result = getattr(df, opname)(ser, axis=0) - - expected = DataFrame({col: op(df[col], ser) for col in df.columns}) - tm.assert_frame_equal(result, expected) - - result2 = getattr(df, opname)(ser.values, axis=0) - tm.assert_frame_equal(result2, expected) - def test_df_add_td64_columnwise(self): # GH 22534 Check that column-wise addition broadcasts correctly dti = pd.date_range("2016-01-01", periods=10) @@ -2029,14 +1999,15 @@ def test_arith_list_of_arraylike_raise(to_add): to_add + df -def test_inplace_arithmetic_series_update(using_copy_on_write): +def test_inplace_arithmetic_series_update(using_copy_on_write, warn_copy_on_write): # https://github.com/pandas-dev/pandas/issues/36373 df = DataFrame({"A": [1, 2, 3]}) df_orig = df.copy() series = df["A"] vals = series._values - series += 1 + with tm.assert_cow_warning(warn_copy_on_write): + series += 1 if using_copy_on_write: assert series._values is not vals tm.assert_frame_equal(df, df_orig) @@ -2132,3 +2103,13 @@ def test_enum_column_equality(): expected = Series([True, True, True], name=Cols.col1) tm.assert_series_equal(result, expected) + + +def test_mixed_col_index_dtype(): + # GH 47382 + df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) + df2 = DataFrame(columns=list("abc"), data=0.0, index=[0]) + df1.columns = df2.columns.astype("string") + result = df1 + df2 + expected = DataFrame(columns=list("abc"), data=1.0, index=[0]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 8c53ffdd493d6..295f457bcaeab 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -334,7 +334,7 @@ def test_is_mixed_type(self, float_frame, float_string_frame): assert not float_frame._is_mixed_type assert float_string_frame._is_mixed_type - def test_stale_cached_series_bug_473(self, using_copy_on_write): + def test_stale_cached_series_bug_473(self, using_copy_on_write, warn_copy_on_write): # this is chained, but ok with option_context("chained_assignment", None): Y = DataFrame( @@ -347,6 +347,9 @@ def test_stale_cached_series_bug_473(self, using_copy_on_write): if using_copy_on_write: with tm.raises_chained_assignment_error(): Y["g"]["c"] = np.nan + elif warn_copy_on_write: + with tm.assert_cow_warning(): + Y["g"]["c"] = np.nan else: Y["g"]["c"] = np.nan repr(Y) @@ -357,13 +360,16 @@ def test_stale_cached_series_bug_473(self, using_copy_on_write): else: assert pd.isna(Y["g"]["c"]) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_strange_column_corruption_issue(self, using_copy_on_write): # TODO(wesm): Unclear how exactly this is related to internal matters df = DataFrame(index=[0, 1]) df[0] = np.nan wasCol = {} - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): for i, dt in enumerate(df.index): for col in range(100, 200): if col not in wasCol: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 35bc23601eaf4..5530fa336971c 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -293,23 +293,27 @@ def test_constructor_dtype_copy(self): new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 - def test_constructor_dtype_nocast_view_dataframe(self, using_copy_on_write): + def test_constructor_dtype_nocast_view_dataframe( + self, using_copy_on_write, warn_copy_on_write + ): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) if using_copy_on_write: should_be_view.iloc[0, 0] = 99 assert df.values[0, 0] == 1 else: - should_be_view[0][0] = 99 + with tm.assert_cow_warning(warn_copy_on_write): + should_be_view[0][0] = 99 assert df.values[0, 0] == 99 def test_constructor_dtype_nocast_view_2d_array( - self, using_array_manager, using_copy_on_write + self, using_array_manager, using_copy_on_write, warn_copy_on_write ): df = DataFrame([[1, 2], [3, 4]], dtype="int64") if not using_array_manager and not using_copy_on_write: should_be_view = DataFrame(df.values, dtype=df[0].dtype) - should_be_view[0][0] = 97 + with tm.assert_cow_warning(warn_copy_on_write): + should_be_view[0][0] = 97 assert df.values[0, 0] == 97 else: # INFO(ArrayManager) DataFrame(ndarray) doesn't necessarily preserve diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr.py similarity index 89% rename from pandas/tests/frame/test_repr_info.py rename to pandas/tests/frame/test_repr.py index 23aef9ba7d18d..7c0e374c50bab 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr.py @@ -25,7 +25,7 @@ import pandas.io.formats.format as fmt -class TestDataFrameReprInfoEtc: +class TestDataFrameRepr: def test_repr_bytes_61_lines(self): # GH#12857 lets = list("ACDEFGHIJKLMNOP") @@ -141,11 +141,8 @@ def test_repr_empty(self): repr(frame) def test_repr_mixed(self, float_string_frame): - buf = StringIO() - # mixed repr(float_string_frame) - float_string_frame.info(verbose=False, buf=buf) @pytest.mark.slow def test_repr_mixed_big(self): @@ -162,26 +159,11 @@ def test_repr_mixed_big(self): repr(biggie) - def test_repr(self, float_frame): - buf = StringIO() - - # small one - repr(float_frame) - float_frame.info(verbose=False, buf=buf) - - # even smaller - float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf) - float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf) - - # exhausting cases in DataFrame.info - + def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) repr(no_index) - # no columns or index - DataFrame().info(buf=buf) - df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"]) assert "\t" not in repr(df) assert "\r" not in repr(df) @@ -204,7 +186,7 @@ def test_repr_big(self): biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200)) repr(biggie) - def test_repr_unsortable(self, float_frame): + def test_repr_unsortable(self): # columns are not sortable unsortable = DataFrame( @@ -218,6 +200,9 @@ def test_repr_unsortable(self, float_frame): ) repr(unsortable) + def test_repr_float_frame_options(self, float_frame): + repr(float_frame) + fmt.set_option("display.precision", 3) repr(float_frame) @@ -257,7 +242,7 @@ def test_str_to_bytes_raises(self): with pytest.raises(TypeError, match=msg): bytes(df) - def test_very_wide_info_repr(self): + def test_very_wide_repr(self): df = DataFrame( np.random.default_rng(2).standard_normal((10, 20)), columns=np.array(["a" * 10] * 20, dtype=object), @@ -437,20 +422,6 @@ def test_to_records_with_inf_record(self): result = repr(df) assert result == expected - def test_masked_ea_with_formatter(self): - # GH#39336 - df = DataFrame( - { - "a": Series([0.123456789, 1.123456789], dtype="Float64"), - "b": Series([1, 2], dtype="Int64"), - } - ) - result = df.to_string(formatters=["{:.2f}".format, "{:.2f}".format]) - expected = """ a b -0 0.12 1.00 -1 1.12 2.00""" - assert result == expected - def test_repr_ea_columns(self, any_string_dtype): # GH#54797 pytest.importorskip("pyarrow") @@ -461,3 +432,39 @@ def test_repr_ea_columns(self, any_string_dtype): 1 2 5 2 3 6""" assert repr(df) == expected + + +@pytest.mark.parametrize( + "data,output", + [ + ([2, complex("nan"), 1], [" 2.0+0.0j", " NaN+0.0j", " 1.0+0.0j"]), + ([2, complex("nan"), -1], [" 2.0+0.0j", " NaN+0.0j", "-1.0+0.0j"]), + ([-2, complex("nan"), -1], ["-2.0+0.0j", " NaN+0.0j", "-1.0+0.0j"]), + ([-1.23j, complex("nan"), -1], ["-0.00-1.23j", " NaN+0.00j", "-1.00+0.00j"]), + ([1.23j, complex("nan"), 1.23], [" 0.00+1.23j", " NaN+0.00j", " 1.23+0.00j"]), + ( + [-1.23j, complex(np.nan, np.nan), 1], + ["-0.00-1.23j", " NaN+ NaNj", " 1.00+0.00j"], + ), + ( + [-1.23j, complex(1.2, np.nan), 1], + ["-0.00-1.23j", " 1.20+ NaNj", " 1.00+0.00j"], + ), + ( + [-1.23j, complex(np.nan, -1.2), 1], + ["-0.00-1.23j", " NaN-1.20j", " 1.00+0.00j"], + ), + ], +) +@pytest.mark.parametrize("as_frame", [True, False]) +def test_repr_with_complex_nans(data, output, as_frame): + # GH#53762, GH#53841 + obj = Series(np.array(data)) + if as_frame: + obj = obj.to_frame(name="val") + reprs = [f"{i} {val}" for i, val in enumerate(output)] + expected = f"{'val': >{len(reprs[0])}}\n" + "\n".join(reprs) + else: + reprs = [f"{i} {val}" for i, val in enumerate(output)] + expected = "\n".join(reprs) + "\ndtype: complex128" + assert str(obj) == expected, f"\n{str(obj)}\n\n{expected}" diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index e041eff697718..2e7e8eba270c0 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1,5 +1,4 @@ from datetime import datetime -from io import StringIO import itertools import re @@ -1771,21 +1770,21 @@ def test_stack_duplicate_index(self, idx, columns, exp_idx, future_stack): "ignore:The previous implementation of stack is deprecated" ) def test_unstack_odd_failure(self, future_stack): - data = """day,time,smoker,sum,len -Fri,Dinner,No,8.25,3. -Fri,Dinner,Yes,27.03,9 -Fri,Lunch,No,3.0,1 -Fri,Lunch,Yes,13.68,6 -Sat,Dinner,No,139.63,45 -Sat,Dinner,Yes,120.77,42 -Sun,Dinner,No,180.57,57 -Sun,Dinner,Yes,66.82,19 -Thu,Dinner,No,3.0,1 -Thu,Lunch,No,117.32,44 -Thu,Lunch,Yes,51.51,17""" - - df = pd.read_csv(StringIO(data)).set_index(["day", "time", "smoker"]) - + mi = MultiIndex.from_arrays( + [ + ["Fri"] * 4 + ["Sat"] * 2 + ["Sun"] * 2 + ["Thu"] * 3, + ["Dinner"] * 2 + ["Lunch"] * 2 + ["Dinner"] * 5 + ["Lunch"] * 2, + ["No", "Yes"] * 4 + ["No", "No", "Yes"], + ], + names=["day", "time", "smoker"], + ) + df = DataFrame( + { + "sum": np.arange(11, dtype="float64"), + "len": np.arange(11, dtype="float64"), + }, + index=mi, + ) # it works, #2100 result = df.unstack(2) @@ -2186,7 +2185,7 @@ def __init__(self, *args, **kwargs) -> None: with monkeypatch.context() as m: m.setattr(reshape_lib, "_Unstacker", MockUnstacker) df = DataFrame( - np.random.default_rng(2).standard_normal((2**16, 2)), + np.zeros((2**16, 2)), index=[np.arange(2**16), np.arange(2**16)], ) msg = "The following operation may generate" diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 8e657f197ca1e..55bfefaeb53a9 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -524,6 +524,8 @@ def test_subclassed_wide_to_long(self): tm.assert_frame_equal(long_frame, expected) + # TODO(CoW-warn) should not need to warn + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_subclassed_apply(self): # GH 19822 diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 37f9d26284f52..b82908ef2aa21 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -4,7 +4,6 @@ and proper parameter handling """ -from itertools import product import numpy as np import pytest @@ -46,7 +45,6 @@ def tests_value_counts_index_names_category_column(): tm.assert_series_equal(result, expected) -# our starting frame def seed_df(seed_nans, n, m): days = date_range("2015-08-24", periods=10) @@ -70,29 +68,32 @@ def seed_df(seed_nans, n, m): return frame -# create input df, keys, and the bins -binned = [] -ids = [] -for seed_nans in [True, False]: - for n, m in product((100, 1000), (5, 20)): - df = seed_df(seed_nans, n, m) - bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2) - keys = "1st", "2nd", ["1st", "2nd"] - for k, b in product(keys, bins): - binned.append((df, k, b, n, m)) - ids.append(f"{k}-{n}-{m}") - - @pytest.mark.slow -@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) +@pytest.mark.parametrize("seed_nans", [True, False]) +@pytest.mark.parametrize("num_rows", [10, 50]) +@pytest.mark.parametrize("max_int", [5, 20]) +@pytest.mark.parametrize("keys", ["1st", "2nd", ["1st", "2nd"]], ids=repr) +@pytest.mark.parametrize("bins", [None, [0, 5]], ids=repr) @pytest.mark.parametrize("isort", [True, False]) @pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("dropna", [True, False]) def test_series_groupby_value_counts( - df, keys, bins, n, m, isort, normalize, name, sort, ascending, dropna + seed_nans, + num_rows, + max_int, + keys, + bins, + isort, + normalize, + name, + sort, + ascending, + dropna, ): + df = seed_df(seed_nans, num_rows, max_int) + def rebuild_index(df): arr = list(map(df.index.get_level_values, range(df.index.nlevels))) df.index = MultiIndex.from_arrays(arr, names=df.index.names) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5331b2e2c5d81..2f2648b9293c5 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -2,7 +2,6 @@ date, datetime, ) -from io import StringIO import numpy as np import pytest @@ -38,39 +37,80 @@ def store(group): tm.assert_frame_equal(groups[0], expected_value) -def test_apply_issues(): +def test_apply_index_date(): # GH 5788 - - s = """2011.05.16,00:00,1.40893 -2011.05.16,01:00,1.40760 -2011.05.16,02:00,1.40750 -2011.05.16,03:00,1.40649 -2011.05.17,02:00,1.40893 -2011.05.17,03:00,1.40760 -2011.05.17,04:00,1.40750 -2011.05.17,05:00,1.40649 -2011.05.18,02:00,1.40893 -2011.05.18,03:00,1.40760 -2011.05.18,04:00,1.40750 -2011.05.18,05:00,1.40649""" - - df = pd.read_csv( - StringIO(s), - header=None, - names=["date", "time", "value"], - parse_dates=[["date", "time"]], + ts = [ + "2011-05-16 00:00", + "2011-05-16 01:00", + "2011-05-16 02:00", + "2011-05-16 03:00", + "2011-05-17 02:00", + "2011-05-17 03:00", + "2011-05-17 04:00", + "2011-05-17 05:00", + "2011-05-18 02:00", + "2011-05-18 03:00", + "2011-05-18 04:00", + "2011-05-18 05:00", + ] + df = DataFrame( + { + "value": [ + 1.40893, + 1.40760, + 1.40750, + 1.40649, + 1.40893, + 1.40760, + 1.40750, + 1.40649, + 1.40893, + 1.40760, + 1.40750, + 1.40649, + ], + }, + index=Index(pd.to_datetime(ts), name="date_time"), ) - df = df.set_index("date_time") - expected = df.groupby(df.index.date).idxmax() result = df.groupby(df.index.date).apply(lambda x: x.idxmax()) tm.assert_frame_equal(result, expected) + +def test_apply_index_date_object(): # GH 5789 # don't auto coerce dates - df = pd.read_csv(StringIO(s), header=None, names=["date", "time", "value"]) + ts = [ + "2011-05-16 00:00", + "2011-05-16 01:00", + "2011-05-16 02:00", + "2011-05-16 03:00", + "2011-05-17 02:00", + "2011-05-17 03:00", + "2011-05-17 04:00", + "2011-05-17 05:00", + "2011-05-18 02:00", + "2011-05-18 03:00", + "2011-05-18 04:00", + "2011-05-18 05:00", + ] + df = DataFrame([row.split() for row in ts], columns=["date", "time"]) + df["value"] = [ + 1.40893, + 1.40760, + 1.40750, + 1.40649, + 1.40893, + 1.40760, + 1.40750, + 1.40649, + 1.40893, + 1.40760, + 1.40750, + 1.40649, + ] exp_idx = Index( - ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" + ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=object, name="date" ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 66e5f2bf133f9..8a878936946f8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -47,7 +47,7 @@ def test_groupby_std_datetimelike(): ser = Series(tdi) ser[::5] *= 2 # get different std for different groups - df = ser.to_frame("A") + df = ser.to_frame("A").copy() df["B"] = ser + Timestamp(0) df["C"] = ser + Timestamp(0, tz="UTC") diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 35ad8e3f5dc61..f48926e4d0c77 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1,6 +1,5 @@ import builtins import datetime as dt -from io import StringIO from string import ascii_lowercase import numpy as np @@ -589,13 +588,18 @@ def test_min_empty_string_dtype(func): def test_max_nan_bug(): - raw = """,Date,app,File --04-23,2013-04-23 00:00:00,,log080001.log --05-06,2013-05-06 00:00:00,,log.log --05-07,2013-05-07 00:00:00,OE,xlsx""" - - with tm.assert_produces_warning(UserWarning, match="Could not infer format"): - df = pd.read_csv(StringIO(raw), parse_dates=[0]) + df = DataFrame( + { + "Unnamed: 0": ["-04-23", "-05-06", "-05-07"], + "Date": [ + "2013-04-23 00:00:00", + "2013-05-06 00:00:00", + "2013-05-07 00:00:00", + ], + "app": Series([np.nan, np.nan, "OE"]), + "File": ["log080001.log", "log.log", "xlsx"], + } + ) gb = df.groupby("Date") r = gb[["File"]].max() e = gb["File"].max().to_frame() diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 76eab9690b62e..f392253ac310f 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -5,7 +5,6 @@ datetime, timedelta, ) -from io import StringIO import numpy as np import pytest @@ -607,14 +606,26 @@ def test_frame_datetime64_handling_groupby(self): def test_groupby_multi_timezone(self): # combining multiple / different timezones yields UTC + df = DataFrame( + { + "value": range(5), + "date": [ + "2000-01-28 16:47:00", + "2000-01-29 16:48:00", + "2000-01-30 16:49:00", + "2000-01-31 16:50:00", + "2000-01-01 16:50:00", + ], + "tz": [ + "America/Chicago", + "America/Chicago", + "America/Los_Angeles", + "America/Chicago", + "America/New_York", + ], + } + ) - data = """0,2000-01-28 16:47:00,America/Chicago -1,2000-01-29 16:48:00,America/Chicago -2,2000-01-30 16:49:00,America/Los_Angeles -3,2000-01-31 16:50:00,America/Chicago -4,2000-01-01 16:50:00,America/New_York""" - - df = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"]) result = df.groupby("tz", group_keys=False).date.apply( lambda x: pd.to_datetime(x).dt.tz_localize(x.name) ) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 53fbd2edc8194..974ae2f1ad17b 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1,6 +1,4 @@ """ test with the .transform """ -from io import StringIO - import numpy as np import pytest @@ -337,22 +335,28 @@ def test_transform_datetime_to_numeric(): def test_transform_casting(): # 13046 - data = """ - idx A ID3 DATETIME - 0 B-028 b76cd912ff "2014-10-08 13:43:27" - 1 B-054 4a57ed0b02 "2014-10-08 14:26:19" - 2 B-076 1a682034f8 "2014-10-08 14:29:01" - 3 B-023 b76cd912ff "2014-10-08 18:39:34" - 4 B-023 f88g8d7sds "2014-10-08 18:40:18" - 5 B-033 b76cd912ff "2014-10-08 18:44:30" - 6 B-032 b76cd912ff "2014-10-08 18:46:00" - 7 B-037 b76cd912ff "2014-10-08 18:52:15" - 8 B-046 db959faf02 "2014-10-08 18:59:59" - 9 B-053 b76cd912ff "2014-10-08 19:17:48" - 10 B-065 b76cd912ff "2014-10-08 19:21:38" - """ - df = pd.read_csv( - StringIO(data), sep=r"\s+", index_col=[0], parse_dates=["DATETIME"] + times = [ + "13:43:27", + "14:26:19", + "14:29:01", + "18:39:34", + "18:40:18", + "18:44:30", + "18:46:00", + "18:52:15", + "18:59:59", + "19:17:48", + "19:21:38", + ] + df = DataFrame( + { + "A": [f"B-{i}" for i in range(11)], + "ID3": np.take( + ["a", "b", "c", "d", "e"], [0, 1, 2, 1, 3, 1, 1, 1, 4, 1, 1] + ), + "DATETIME": pd.to_datetime([f"2014-10-08 {time}" for time in times]), + }, + index=pd.RangeIndex(11, name="idx"), ) result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff()) diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 08ec11c87b623..ef86e7800dbb7 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1013,6 +1013,40 @@ def test_dti_convert_datetime_list(self, tzstr): dr2 = DatetimeIndex(list(dr), name="foo", freq="D") tm.assert_index_equal(dr, dr2) + def test_dti_constructor_with_non_nano_dtype(self): + # GH#55756 + ts = Timestamp("2999-01-01") + dtype = "M8[us]" + # NB: the 2500 is interpreted as nanoseconds and rounded *down* + # to 2 microseconds + vals = [ts, "2999-01-02 03:04:05.678910", 2500] + result = DatetimeIndex(vals, dtype=dtype) + exp_arr = np.array([ts.asm8, vals[1], 2], dtype=dtype) + expected = DatetimeIndex(exp_arr, dtype=dtype) + tm.assert_index_equal(result, expected) + + result2 = DatetimeIndex(np.array(vals, dtype=object), dtype=dtype) + tm.assert_index_equal(result2, expected) + + def test_dti_constructor_with_non_nano_now_today(self): + # GH#55756 + now = Timestamp.now() + today = Timestamp.today() + result = DatetimeIndex(["now", "today"], dtype="M8[s]") + assert result.dtype == "M8[s]" + + # result may not exactly match [now, today] so we'll test it up to a tolerance. + # (it *may* match exactly due to rounding) + tolerance = pd.Timedelta(microseconds=1) + + diff0 = result[0] - now.as_unit("s") + assert diff0 >= pd.Timedelta(0) + assert diff0 < tolerance + + diff1 = result[1] - today.as_unit("s") + assert diff1 >= pd.Timedelta(0) + assert diff1 < tolerance + class TestTimeSeries: def test_dti_constructor_preserve_dti_freq(self): diff --git a/pandas/tests/indexes/multi/test_formats.py b/pandas/tests/indexes/multi/test_formats.py index bbe94824eefa1..1736f65e355fb 100644 --- a/pandas/tests/indexes/multi/test_formats.py +++ b/pandas/tests/indexes/multi/test_formats.py @@ -229,3 +229,13 @@ def test_tuple_width(self, wide_multi_index): ('abc', 10, '2000-01-01 00:33:19', '2000-01-01 00:33:19', ...)], names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'], length=2000)""" assert result == expected + + def test_multiindex_long_element(self): + # Non-regression test towards GH#52960 + data = MultiIndex.from_tuples([("c" * 62,)]) + + expected = ( + "MultiIndex([('cccccccccccccccccccccccccccccccccccccccc" + "cccccccccccccccccccccc',)],\n )" + ) + assert str(data) == expected diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 04ab2020b4c7a..0c32149124ac6 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1,6 +1,5 @@ from collections import defaultdict from datetime import datetime -from io import StringIO import math import operator import re @@ -1174,13 +1173,21 @@ def test_groupby(self): def test_equals_op_multiindex(self, mi, expected): # GH9785 # test comparisons of multiindex - df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1]) + df = DataFrame( + [3, 6], + columns=["c"], + index=MultiIndex.from_arrays([[1, 4], [2, 5]], names=["a", "b"]), + ) result = df.index == mi tm.assert_numpy_array_equal(result, expected) def test_equals_op_multiindex_identify(self): - df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1]) + df = DataFrame( + [3, 6], + columns=["c"], + index=MultiIndex.from_arrays([[1, 4], [2, 5]], names=["a", "b"]), + ) result = df.index == df.index expected = np.array([True, True]) @@ -1194,7 +1201,11 @@ def test_equals_op_multiindex_identify(self): ], ) def test_equals_op_mismatched_multiindex_raises(self, index): - df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1]) + df = DataFrame( + [3, 6], + columns=["c"], + index=MultiIndex.from_arrays([[1, 4], [2, 5]], names=["a", "b"]), + ) with pytest.raises(ValueError, match="Lengths must match"): df.index == index @@ -1454,6 +1465,20 @@ def test_is_monotonic_na(self, index): assert index._is_strictly_monotonic_increasing is False assert index._is_strictly_monotonic_decreasing is False + @pytest.mark.parametrize("dtype", ["f8", "m8[ns]", "M8[us]"]) + @pytest.mark.parametrize("unique_first", [True, False]) + def test_is_monotonic_unique_na(self, dtype, unique_first): + # GH 55755 + index = Index([None, 1, 1], dtype=dtype) + if unique_first: + assert index.is_unique is False + assert index.is_monotonic_increasing is False + assert index.is_monotonic_decreasing is False + else: + assert index.is_monotonic_increasing is False + assert index.is_monotonic_decreasing is False + assert index.is_unique is False + def test_int_name_format(self, frame_or_series): index = Index(["a", "b", "c"], name=0) result = frame_or_series(list(range(3)), index=index) diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 7adc697610f3c..c3a2c582854f3 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -12,7 +12,7 @@ import pandas._testing as tm -def test_detect_chained_assignment(using_copy_on_write): +def test_detect_chained_assignment(using_copy_on_write, warn_copy_on_write): # Inplace ops, originally from: # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug a = [12, 23] @@ -32,6 +32,9 @@ def test_detect_chained_assignment(using_copy_on_write): if using_copy_on_write: with tm.raises_chained_assignment_error(): zed["eyes"]["right"].fillna(value=555, inplace=True) + elif warn_copy_on_write: + # TODO(CoW-warn) should warn + zed["eyes"]["right"].fillna(value=555, inplace=True) else: msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(SettingWithCopyError, match=msg): diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index c8b10f72c9ad9..873c4e3e60f4c 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -698,10 +698,19 @@ def test_loc_mi_with_level1_named_0(): tm.assert_series_equal(result, expected) -def test_getitem_str_slice(datapath): +def test_getitem_str_slice(): # GH#15928 - path = datapath("reshape", "merge", "data", "quotes2.csv") - df = pd.read_csv(path, parse_dates=["time"]) + df = DataFrame( + [ + ["20160525 13:30:00.023", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.131", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.135", "MSFT", "51.92", "51.95"], + ["20160525 13:30:00.135", "AAPL", "98.61", "98.62"], + ], + columns="time,ticker,bid,ask".split(","), + ) df2 = df.set_index(["ticker", "time"]).sort_index() res = df2.loc[("AAPL", slice("2016-05-25 13:30:00")), :].droplevel(0) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 51b94c3beeb6f..ec787a475575d 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -175,7 +175,7 @@ def test_multiindex_setitem2(self): ) expected = df_orig.copy() - expected.iloc[[0, 2, 3]] *= 2 + expected.iloc[[0, 1, 3]] *= 2 idx = pd.IndexSlice df = df_orig.copy() @@ -273,16 +273,20 @@ def test_groupby_example(self): new_vals = np.arange(df2.shape[0]) df.loc[name, "new_col"] = new_vals - def test_series_setitem(self, multiindex_year_month_day_dataframe_random_data): + def test_series_setitem( + self, multiindex_year_month_day_dataframe_random_data, warn_copy_on_write + ): ymd = multiindex_year_month_day_dataframe_random_data s = ymd["A"] - s[2000, 3] = np.nan + with tm.assert_cow_warning(warn_copy_on_write): + s[2000, 3] = np.nan assert isna(s.values[42:65]).all() assert notna(s.values[:42]).all() assert notna(s.values[65:]).all() - s[2000, 3, 10] = np.nan + with tm.assert_cow_warning(warn_copy_on_write): + s[2000, 3, 10] = np.nan assert isna(s.iloc[49]) with pytest.raises(KeyError, match="49"): @@ -527,13 +531,17 @@ def test_frame_setitem_view_direct( def test_frame_setitem_copy_raises( - multiindex_dataframe_random_data, using_copy_on_write + multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write ): # will raise/warn as its chained assignment df = multiindex_dataframe_random_data.T if using_copy_on_write: with tm.raises_chained_assignment_error(): df["foo"]["one"] = 2 + elif warn_copy_on_write: + # TODO(CoW-warn) should warn + with tm.assert_cow_warning(False): + df["foo"]["one"] = 2 else: msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(SettingWithCopyError, match=msg): @@ -541,7 +549,7 @@ def test_frame_setitem_copy_raises( def test_frame_setitem_copy_no_write( - multiindex_dataframe_random_data, using_copy_on_write + multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write ): frame = multiindex_dataframe_random_data.T expected = frame @@ -549,6 +557,10 @@ def test_frame_setitem_copy_no_write( if using_copy_on_write: with tm.raises_chained_assignment_error(): df["foo"]["one"] = 2 + elif warn_copy_on_write: + # TODO(CoW-warn) should warn + with tm.assert_cow_warning(False): + df["foo"]["one"] = 2 else: msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(SettingWithCopyError, match=msg): diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 7353b5ef76ba3..562638f7058c6 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -201,7 +201,7 @@ def test_setitem_chained_setfault(self, using_copy_on_write): tm.assert_frame_equal(result, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment(self, using_copy_on_write): + def test_detect_chained_assignment(self, using_copy_on_write, warn_copy_on_write): with option_context("chained_assignment", "raise"): # work with the chain expected = DataFrame([[-5, 1], [-6, 3]], columns=list("AB")) @@ -218,13 +218,15 @@ def test_detect_chained_assignment(self, using_copy_on_write): df["A"][1] = -6 tm.assert_frame_equal(df, df_original) else: - df["A"][0] = -5 - df["A"][1] = -6 + with tm.assert_cow_warning(warn_copy_on_write): + df["A"][0] = -5 + with tm.assert_cow_warning(warn_copy_on_write): + df["A"][1] = -6 tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow def test_detect_chained_assignment_raises( - self, using_array_manager, using_copy_on_write + self, using_array_manager, using_copy_on_write, warn_copy_on_write ): # test with the chaining df = DataFrame( @@ -242,6 +244,11 @@ def test_detect_chained_assignment_raises( with tm.raises_chained_assignment_error(): df["A"][1] = -6 tm.assert_frame_equal(df, df_original) + elif warn_copy_on_write: + with tm.assert_cow_warning(): + df["A"][0] = -5 + with tm.assert_cow_warning(): + df["A"][1] = np.nan elif not using_array_manager: with pytest.raises(SettingWithCopyError, match=msg): df["A"][0] = -5 @@ -260,7 +267,9 @@ def test_detect_chained_assignment_raises( tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment_fails(self, using_copy_on_write): + def test_detect_chained_assignment_fails( + self, using_copy_on_write, warn_copy_on_write + ): # Using a copy (the chain), fails df = DataFrame( { @@ -272,12 +281,18 @@ def test_detect_chained_assignment_fails(self, using_copy_on_write): if using_copy_on_write: with tm.raises_chained_assignment_error(): df.loc[0]["A"] = -5 + elif warn_copy_on_write: + # TODO(CoW-warn) should warn + with tm.assert_cow_warning(False): + df.loc[0]["A"] = -5 else: with pytest.raises(SettingWithCopyError, match=msg): df.loc[0]["A"] = -5 @pytest.mark.arm_slow - def test_detect_chained_assignment_doc_example(self, using_copy_on_write): + def test_detect_chained_assignment_doc_example( + self, using_copy_on_write, warn_copy_on_write + ): # Doc example df = DataFrame( { @@ -287,24 +302,27 @@ def test_detect_chained_assignment_doc_example(self, using_copy_on_write): ) assert df._is_copy is None + indexer = df.a.str.startswith("o") if using_copy_on_write: - indexer = df.a.str.startswith("o") with tm.raises_chained_assignment_error(): df[indexer]["c"] = 42 + elif warn_copy_on_write: + # TODO(CoW-warn) should warn + with tm.assert_cow_warning(False): + df[indexer]["c"] = 42 else: with pytest.raises(SettingWithCopyError, match=msg): - indexer = df.a.str.startswith("o") df[indexer]["c"] = 42 @pytest.mark.arm_slow def test_detect_chained_assignment_object_dtype( - self, using_array_manager, using_copy_on_write + self, using_array_manager, using_copy_on_write, warn_copy_on_write ): expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) df_original = df.copy() - if not using_copy_on_write: + if not using_copy_on_write and not warn_copy_on_write: with pytest.raises(SettingWithCopyError, match=msg): df.loc[0]["A"] = 111 @@ -312,6 +330,11 @@ def test_detect_chained_assignment_object_dtype( with tm.raises_chained_assignment_error(): df["A"][0] = 111 tm.assert_frame_equal(df, df_original) + elif warn_copy_on_write: + # TODO(CoW-warn) should give different message + with tm.assert_cow_warning(): + df["A"][0] = 111 + tm.assert_frame_equal(df, expected) elif not using_array_manager: with pytest.raises(SettingWithCopyError, match=msg): df["A"][0] = 111 @@ -367,8 +390,10 @@ def test_detect_chained_assignment_implicit_take(self): df["letters"] = df["letters"].apply(str.lower) @pytest.mark.arm_slow - def test_detect_chained_assignment_implicit_take2(self, using_copy_on_write): - if using_copy_on_write: + def test_detect_chained_assignment_implicit_take2( + self, using_copy_on_write, warn_copy_on_write + ): + if using_copy_on_write or warn_copy_on_write: pytest.skip("_is_copy is not always set for CoW") # Implicitly take 2 df = random_text(100000) @@ -422,7 +447,9 @@ def test_detect_chained_assignment_false_positives(self): str(df) @pytest.mark.arm_slow - def test_detect_chained_assignment_undefined_column(self, using_copy_on_write): + def test_detect_chained_assignment_undefined_column( + self, using_copy_on_write, warn_copy_on_write + ): # from SO: # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc df = DataFrame(np.arange(0, 9), columns=["count"]) @@ -433,13 +460,17 @@ def test_detect_chained_assignment_undefined_column(self, using_copy_on_write): with tm.raises_chained_assignment_error(): df.iloc[0:5]["group"] = "a" tm.assert_frame_equal(df, df_original) + elif warn_copy_on_write: + # TODO(CoW-warn) should warn + with tm.assert_cow_warning(False): + df.iloc[0:5]["group"] = "a" else: with pytest.raises(SettingWithCopyError, match=msg): df.iloc[0:5]["group"] = "a" @pytest.mark.arm_slow def test_detect_chained_assignment_changing_dtype( - self, using_array_manager, using_copy_on_write + self, using_array_manager, using_copy_on_write, warn_copy_on_write ): # Mixed type setting but same dtype & changing dtype df = DataFrame( @@ -460,8 +491,14 @@ def test_detect_chained_assignment_changing_dtype( with tm.raises_chained_assignment_error(extra_warnings=(FutureWarning,)): df["C"][2] = "foo" tm.assert_frame_equal(df, df_original) - - if not using_copy_on_write: + elif warn_copy_on_write: + # TODO(CoW-warn) should warn + with tm.assert_cow_warning(False): + df.loc[2]["D"] = "foo" + # TODO(CoW-warn) should give different message + with tm.assert_cow_warning(): + df["C"][2] = "foo" + else: with pytest.raises(SettingWithCopyError, match=msg): df.loc[2]["D"] = "foo" @@ -477,7 +514,7 @@ def test_detect_chained_assignment_changing_dtype( df["C"][2] = "foo" assert df.loc[2, "C"] == "foo" - def test_setting_with_copy_bug(self, using_copy_on_write): + def test_setting_with_copy_bug(self, using_copy_on_write, warn_copy_on_write): # operating on a copy df = DataFrame( {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} @@ -489,6 +526,10 @@ def test_setting_with_copy_bug(self, using_copy_on_write): with tm.raises_chained_assignment_error(): df[["c"]][mask] = df[["b"]][mask] tm.assert_frame_equal(df, df_original) + elif warn_copy_on_write: + # TODO(CoW-warn) should warn + with tm.assert_cow_warning(False): + df[["c"]][mask] = df[["b"]][mask] else: with pytest.raises(SettingWithCopyError, match=msg): df[["c"]][mask] = df[["b"]][mask] @@ -502,12 +543,19 @@ def test_setting_with_copy_bug_no_warning(self): # this should not raise df2["y"] = ["g", "h", "i"] - def test_detect_chained_assignment_warnings_errors(self, using_copy_on_write): + def test_detect_chained_assignment_warnings_errors( + self, using_copy_on_write, warn_copy_on_write + ): df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) if using_copy_on_write: with tm.raises_chained_assignment_error(): df.loc[0]["A"] = 111 return + elif warn_copy_on_write: + # TODO(CoW-warn) should warn + with tm.assert_cow_warning(False): + df.loc[0]["A"] = 111 + return with option_context("chained_assignment", "warn"): with tm.assert_produces_warning(SettingWithCopyWarning): @@ -519,14 +567,14 @@ def test_detect_chained_assignment_warnings_errors(self, using_copy_on_write): @pytest.mark.parametrize("rhs", [3, DataFrame({0: [1, 2, 3, 4]})]) def test_detect_chained_assignment_warning_stacklevel( - self, rhs, using_copy_on_write + self, rhs, using_copy_on_write, warn_copy_on_write ): # GH#42570 df = DataFrame(np.arange(25).reshape(5, 5)) df_original = df.copy() chained = df.loc[:3] with option_context("chained_assignment", "warn"): - if not using_copy_on_write: + if not using_copy_on_write and not warn_copy_on_write: with tm.assert_produces_warning(SettingWithCopyWarning) as t: chained[2] = rhs assert t[0].filename == __file__ diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 6836e9a7c390e..240ce71c46093 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -12,6 +12,7 @@ import numpy as np import pytest +from pandas._libs import index as libindex from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -1974,12 +1975,14 @@ def test_loc_drops_level(self): class TestLocSetitemWithExpansion: - @pytest.mark.slow - def test_loc_setitem_with_expansion_large_dataframe(self): + def test_loc_setitem_with_expansion_large_dataframe(self, monkeypatch): # GH#10692 - result = DataFrame({"x": range(10**6)}, dtype="int64") - result.loc[len(result)] = len(result) + 1 - expected = DataFrame({"x": range(10**6 + 1)}, dtype="int64") + size_cutoff = 50 + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + result = DataFrame({"x": range(size_cutoff)}, dtype="int64") + result.loc[size_cutoff] = size_cutoff + expected = DataFrame({"x": range(size_cutoff + 1)}, dtype="int64") tm.assert_frame_equal(result, expected) def test_loc_setitem_empty_series(self): diff --git a/pandas/tests/io/data/excel/test_cell_annotation.ods b/pandas/tests/io/data/excel/test_cell_annotation.ods new file mode 100644 index 0000000000000..f00a88fe681ef Binary files /dev/null and b/pandas/tests/io/data/excel/test_cell_annotation.ods differ diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index 9d49718bec357..f01827fa4ca2f 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -59,3 +59,14 @@ def test_read_unempty_cells(): result = pd.read_excel("test_unempty_cells.ods") tm.assert_frame_equal(result, expected) + + +def test_read_cell_annotation(): + expected = pd.DataFrame( + ["test", np.nan, "test 3"], + columns=["Column 1"], + ) + + result = pd.read_excel("test_cell_annotation.ods") + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/formats/test_eng_formatting.py b/pandas/tests/io/formats/test_eng_formatting.py index 2f18623559557..75e63cb1f6b54 100644 --- a/pandas/tests/io/formats/test_eng_formatting.py +++ b/pandas/tests/io/formats/test_eng_formatting.py @@ -7,6 +7,20 @@ class TestEngFormatter: + def test_eng_float_formatter2(self, float_frame): + df = float_frame + df.loc[5] = 0 + + fmt.set_eng_float_format() + repr(df) + + fmt.set_eng_float_format(use_eng_prefix=True) + repr(df) + + fmt.set_eng_float_format(accuracy=0) + repr(df) + tm.reset_display_options() + def test_eng_float_formatter(self): df = DataFrame({"A": [1.41, 141.0, 14100, 1410000.0]}) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 231beb4abd8ba..8901eb99b7612 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1,17 +1,12 @@ """ -Test output formatting for Series/DataFrame, including to_string & reprs +Tests for the file pandas.io.formats.format, *not* tests for general formatting +of pandas objects. """ -from datetime import ( - datetime, - timedelta, -) +from datetime import datetime from io import StringIO -import itertools from pathlib import Path import re from shutil import get_terminal_size -import sys -import textwrap import numpy as np import pytest @@ -146,43 +141,6 @@ def has_expanded_repr(df): class TestDataFrameFormatting: - def test_eng_float_formatter(self, float_frame): - df = float_frame - df.loc[5] = 0 - - fmt.set_eng_float_format() - repr(df) - - fmt.set_eng_float_format(use_eng_prefix=True) - repr(df) - - fmt.set_eng_float_format(accuracy=0) - repr(df) - tm.reset_display_options() - - @pytest.mark.parametrize( - "row, columns, show_counts, result", - [ - [20, 20, None, True], - [20, 20, True, True], - [20, 20, False, False], - [5, 5, None, False], - [5, 5, True, False], - [5, 5, False, False], - ], - ) - def test_show_counts(self, row, columns, show_counts, result): - # Explicit cast to float to avoid implicit cast when setting nan - df = DataFrame(1, columns=range(10), index=range(10)).astype({1: "float"}) - df.iloc[1, 1] = np.nan - - with option_context( - "display.max_info_rows", row, "display.max_info_columns", columns - ): - with StringIO() as buf: - df.info(buf=buf, show_counts=show_counts) - assert ("non-null" in buf.getvalue()) is result - def test_repr_truncation(self): max_len = 20 with option_context("display.max_colwidth", max_len): @@ -539,17 +497,7 @@ def test_auto_detect(self): with option_context("display.max_columns", 0): assert has_horizontally_truncated_repr(df) - def test_to_string_repr_unicode(self): - buf = StringIO() - - unicode_values = ["\u03c3"] * 10 - unicode_values = np.array(unicode_values, dtype=object) - df = DataFrame({"unicode": unicode_values}) - df.to_string(col_space=10, buf=buf) - - # it works! - repr(df) - + def test_to_string_repr_unicode2(self): idx = Index(["abc", "\u03c3a", "aegdvg"]) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) rs = repr(ser).split("\n") @@ -562,14 +510,6 @@ def test_to_string_repr_unicode(self): if not line.startswith("dtype:"): assert len(line) == line_len - # it works even if sys.stdin in None - _stdin = sys.stdin - try: - sys.stdin = None - repr(df) - finally: - sys.stdin = _stdin - def test_east_asian_unicode_false(self): # not aligned properly because of east asian width @@ -919,51 +859,6 @@ def test_to_string_buffer_all_unicode(self): # this should work buf.getvalue() - def test_to_string_with_col_space(self): - df = DataFrame(np.random.default_rng(2).random(size=(1, 3))) - c10 = len(df.to_string(col_space=10).split("\n")[1]) - c20 = len(df.to_string(col_space=20).split("\n")[1]) - c30 = len(df.to_string(col_space=30).split("\n")[1]) - assert c10 < c20 < c30 - - # GH 8230 - # col_space wasn't being applied with header=False - with_header = df.to_string(col_space=20) - with_header_row1 = with_header.splitlines()[1] - no_header = df.to_string(col_space=20, header=False) - assert len(with_header_row1) == len(no_header) - - def test_to_string_with_column_specific_col_space_raises(self): - df = DataFrame( - np.random.default_rng(2).random(size=(3, 3)), columns=["a", "b", "c"] - ) - - msg = ( - "Col_space length\\(\\d+\\) should match " - "DataFrame number of columns\\(\\d+\\)" - ) - with pytest.raises(ValueError, match=msg): - df.to_string(col_space=[30, 40]) - - with pytest.raises(ValueError, match=msg): - df.to_string(col_space=[30, 40, 50, 60]) - - msg = "unknown column" - with pytest.raises(ValueError, match=msg): - df.to_string(col_space={"a": "foo", "b": 23, "d": 34}) - - def test_to_string_with_column_specific_col_space(self): - df = DataFrame( - np.random.default_rng(2).random(size=(3, 3)), columns=["a", "b", "c"] - ) - - result = df.to_string(col_space={"a": 10, "b": 11, "c": 12}) - # 3 separating space + each col_space for (id, a, b, c) - assert len(result.split("\n")[1]) == (3 + 1 + 10 + 11 + 12) - - result = df.to_string(col_space=[10, 11, 12]) - assert len(result.split("\n")[1]) == (3 + 1 + 10 + 11 + 12) - @pytest.mark.parametrize( "index", [ @@ -1114,16 +1009,6 @@ def test_datetimeindex_highprecision(self, start_date): result = str(df.index) assert start_date in result - def test_nonunicode_nonascii_alignment(self): - df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) - rep_str = df.to_string() - lines = rep_str.split("\n") - assert len(lines[1]) == len(lines[2]) - - def test_unicode_problem_decoding_as_ascii(self): - dm = DataFrame({"c/\u03c3": Series({"test": np.nan})}) - str(dm.to_string()) - def test_string_repr_encoding(self, datapath): filepath = datapath("io", "parser", "data", "unicode_series.csv") df = read_csv(filepath, header=None, encoding="latin1") @@ -1265,377 +1150,6 @@ def test_long_series(self): nmatches = len(re.findall("dtype", str_rep)) assert nmatches == 1 - def test_index_with_nan(self): - # GH 2850 - df = DataFrame( - { - "id1": {0: "1a3", 1: "9h4"}, - "id2": {0: np.nan, 1: "d67"}, - "id3": {0: "78d", 1: "79d"}, - "value": {0: 123, 1: 64}, - } - ) - - # multi-index - y = df.set_index(["id1", "id2", "id3"]) - result = y.to_string() - expected = ( - " value\nid1 id2 id3 \n" - "1a3 NaN 78d 123\n9h4 d67 79d 64" - ) - assert result == expected - - # index - y = df.set_index("id2") - result = y.to_string() - expected = ( - " id1 id3 value\nid2 \n" - "NaN 1a3 78d 123\nd67 9h4 79d 64" - ) - assert result == expected - - # with append (this failed in 0.12) - y = df.set_index(["id1", "id2"]).set_index("id3", append=True) - result = y.to_string() - expected = ( - " value\nid1 id2 id3 \n" - "1a3 NaN 78d 123\n9h4 d67 79d 64" - ) - assert result == expected - - # all-nan in mi - df2 = df.copy() - df2.loc[:, "id2"] = np.nan - y = df2.set_index("id2") - result = y.to_string() - expected = ( - " id1 id3 value\nid2 \n" - "NaN 1a3 78d 123\nNaN 9h4 79d 64" - ) - assert result == expected - - # partial nan in mi - df2 = df.copy() - df2.loc[:, "id2"] = np.nan - y = df2.set_index(["id2", "id3"]) - result = y.to_string() - expected = ( - " id1 value\nid2 id3 \n" - "NaN 78d 1a3 123\n 79d 9h4 64" - ) - assert result == expected - - df = DataFrame( - { - "id1": {0: np.nan, 1: "9h4"}, - "id2": {0: np.nan, 1: "d67"}, - "id3": {0: np.nan, 1: "79d"}, - "value": {0: 123, 1: 64}, - } - ) - - y = df.set_index(["id1", "id2", "id3"]) - result = y.to_string() - expected = ( - " value\nid1 id2 id3 \n" - "NaN NaN NaN 123\n9h4 d67 79d 64" - ) - assert result == expected - - def test_to_string(self): - # big mixed - biggie = DataFrame( - { - "A": np.random.default_rng(2).standard_normal(200), - "B": tm.makeStringIndex(200), - }, - ) - - biggie.loc[:20, "A"] = np.nan - biggie.loc[:20, "B"] = np.nan - s = biggie.to_string() - - buf = StringIO() - retval = biggie.to_string(buf=buf) - assert retval is None - assert buf.getvalue() == s - - assert isinstance(s, str) - - # print in right order - result = biggie.to_string( - columns=["B", "A"], col_space=17, float_format="%.5f".__mod__ - ) - lines = result.split("\n") - header = lines[0].strip().split() - joined = "\n".join([re.sub(r"\s+", " ", x).strip() for x in lines[1:]]) - recons = read_csv(StringIO(joined), names=header, header=None, sep=" ") - tm.assert_series_equal(recons["B"], biggie["B"]) - assert recons["A"].count() == biggie["A"].count() - assert (np.abs(recons["A"].dropna() - biggie["A"].dropna()) < 0.1).all() - - # expected = ['B', 'A'] - # assert header == expected - - result = biggie.to_string(columns=["A"], col_space=17) - header = result.split("\n")[0].strip().split() - expected = ["A"] - assert header == expected - - biggie.to_string(columns=["B", "A"], formatters={"A": lambda x: f"{x:.1f}"}) - - biggie.to_string(columns=["B", "A"], float_format=str) - biggie.to_string(columns=["B", "A"], col_space=12, float_format=str) - - frame = DataFrame(index=np.arange(200)) - frame.to_string() - - def test_to_string_no_header(self): - df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - df_s = df.to_string(header=False) - expected = "0 1 4\n1 2 5\n2 3 6" - - assert df_s == expected - - def test_to_string_specified_header(self): - df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - df_s = df.to_string(header=["X", "Y"]) - expected = " X Y\n0 1 4\n1 2 5\n2 3 6" - - assert df_s == expected - - msg = "Writing 2 cols but got 1 aliases" - with pytest.raises(ValueError, match=msg): - df.to_string(header=["X"]) - - def test_to_string_no_index(self): - # GH 16839, GH 13032 - df = DataFrame({"x": [11, 22], "y": [33, -44], "z": ["AAA", " "]}) - - df_s = df.to_string(index=False) - # Leading space is expected for positive numbers. - expected = " x y z\n11 33 AAA\n22 -44 " - assert df_s == expected - - df_s = df[["y", "x", "z"]].to_string(index=False) - expected = " y x z\n 33 11 AAA\n-44 22 " - assert df_s == expected - - def test_to_string_line_width_no_index(self): - # GH 13998, GH 22505 - df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1, index=False) - expected = " x \\\n 1 \n 2 \n 3 \n\n y \n 4 \n 5 \n 6 " - - assert df_s == expected - - df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1, index=False) - expected = " x \\\n11 \n22 \n33 \n\n y \n 4 \n 5 \n 6 " - - assert df_s == expected - - df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) - - df_s = df.to_string(line_width=1, index=False) - expected = " x \\\n 11 \n 22 \n-33 \n\n y \n 4 \n 5 \n-6 " - - assert df_s == expected - - def test_to_string_line_width_no_header(self): - # GH 53054 - df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1, header=False) - expected = "0 1 \\\n1 2 \n2 3 \n\n0 4 \n1 5 \n2 6 " - - assert df_s == expected - - df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1, header=False) - expected = "0 11 \\\n1 22 \n2 33 \n\n0 4 \n1 5 \n2 6 " - - assert df_s == expected - - df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) - - df_s = df.to_string(line_width=1, header=False) - expected = "0 11 \\\n1 22 \n2 -33 \n\n0 4 \n1 5 \n2 -6 " - - assert df_s == expected - - def test_to_string_line_width_no_index_no_header(self): - # GH 53054 - df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1, index=False, header=False) - expected = "1 \\\n2 \n3 \n\n4 \n5 \n6 " - - assert df_s == expected - - df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1, index=False, header=False) - expected = "11 \\\n22 \n33 \n\n4 \n5 \n6 " - - assert df_s == expected - - df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) - - df_s = df.to_string(line_width=1, index=False, header=False) - expected = " 11 \\\n 22 \n-33 \n\n 4 \n 5 \n-6 " - - assert df_s == expected - - def test_to_string_line_width_with_both_index_and_header(self): - # GH 53054 - df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1) - expected = ( - " x \\\n0 1 \n1 2 \n2 3 \n\n y \n0 4 \n1 5 \n2 6 " - ) - - assert df_s == expected - - df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1) - expected = ( - " x \\\n0 11 \n1 22 \n2 33 \n\n y \n0 4 \n1 5 \n2 6 " - ) - - assert df_s == expected - - df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) - - df_s = df.to_string(line_width=1) - expected = ( - " x \\\n0 11 \n1 22 \n2 -33 \n\n y \n0 4 \n1 5 \n2 -6 " - ) - - assert df_s == expected - - def test_to_string_float_formatting(self): - tm.reset_display_options() - with option_context( - "display.precision", - 5, - "display.notebook_repr_html", - False, - ): - df = DataFrame( - {"x": [0, 0.25, 3456.000, 12e45, 1.64e6, 1.7e8, 1.253456, np.pi, -1e6]} - ) - - df_s = df.to_string() - - if _three_digit_exp(): - expected = ( - " x\n0 0.00000e+000\n1 2.50000e-001\n" - "2 3.45600e+003\n3 1.20000e+046\n4 1.64000e+006\n" - "5 1.70000e+008\n6 1.25346e+000\n7 3.14159e+000\n" - "8 -1.00000e+006" - ) - else: - expected = ( - " x\n0 0.00000e+00\n1 2.50000e-01\n" - "2 3.45600e+03\n3 1.20000e+46\n4 1.64000e+06\n" - "5 1.70000e+08\n6 1.25346e+00\n7 3.14159e+00\n" - "8 -1.00000e+06" - ) - assert df_s == expected - - df = DataFrame({"x": [3234, 0.253]}) - df_s = df.to_string() - - expected = " x\n0 3234.000\n1 0.253" - assert df_s == expected - - tm.reset_display_options() - assert get_option("display.precision") == 6 - - df = DataFrame({"x": [1e9, 0.2512]}) - df_s = df.to_string() - - if _three_digit_exp(): - expected = " x\n0 1.000000e+009\n1 2.512000e-001" - else: - expected = " x\n0 1.000000e+09\n1 2.512000e-01" - assert df_s == expected - - def test_to_string_float_format_no_fixed_width(self): - # GH 21625 - df = DataFrame({"x": [0.19999]}) - expected = " x\n0 0.200" - assert df.to_string(float_format="%.3f") == expected - - # GH 22270 - df = DataFrame({"x": [100.0]}) - expected = " x\n0 100" - assert df.to_string(float_format="%.0f") == expected - - def test_to_string_small_float_values(self): - df = DataFrame({"a": [1.5, 1e-17, -5.5e-7]}) - - result = df.to_string() - # sadness per above - if _three_digit_exp(): - expected = ( - " a\n" - "0 1.500000e+000\n" - "1 1.000000e-017\n" - "2 -5.500000e-007" - ) - else: - expected = ( - " a\n" - "0 1.500000e+00\n" - "1 1.000000e-17\n" - "2 -5.500000e-07" - ) - assert result == expected - - # but not all exactly zero - df = df * 0 - result = df.to_string() - expected = " 0\n0 0\n1 0\n2 -0" - - def test_to_string_float_index(self): - index = Index([1.5, 2, 3, 4, 5]) - df = DataFrame(np.arange(5), index=index) - - result = df.to_string() - expected = " 0\n1.5 0\n2.0 1\n3.0 2\n4.0 3\n5.0 4" - assert result == expected - - def test_to_string_complex_float_formatting(self): - # GH #25514, 25745 - with option_context("display.precision", 5): - df = DataFrame( - { - "x": [ - (0.4467846931321966 + 0.0715185102060818j), - (0.2739442392974528 + 0.23515228785438969j), - (0.26974928742135185 + 0.3250604054898979j), - (-1j), - ] - } - ) - result = df.to_string() - expected = ( - " x\n0 0.44678+0.07152j\n" - "1 0.27394+0.23515j\n" - "2 0.26975+0.32506j\n" - "3 -0.00000-1.00000j" - ) - assert result == expected - def test_to_string_ascii_error(self): data = [ ( @@ -1650,139 +1164,6 @@ def test_to_string_ascii_error(self): # it works! repr(df) - def test_to_string_int_formatting(self): - df = DataFrame({"x": [-15, 20, 25, -35]}) - assert issubclass(df["x"].dtype.type, np.integer) - - output = df.to_string() - expected = " x\n0 -15\n1 20\n2 25\n3 -35" - assert output == expected - - def test_to_string_index_formatter(self): - df = DataFrame([range(5), range(5, 10), range(10, 15)]) - - rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) - - xp = """\ - 0 1 2 3 4 -a 0 1 2 3 4 -b 5 6 7 8 9 -c 10 11 12 13 14\ -""" - - assert rs == xp - - def test_to_string_left_justify_cols(self): - tm.reset_display_options() - df = DataFrame({"x": [3234, 0.253]}) - df_s = df.to_string(justify="left") - expected = " x \n0 3234.000\n1 0.253" - assert df_s == expected - - def test_to_string_format_na(self): - tm.reset_display_options() - df = DataFrame( - { - "A": [np.nan, -1, -2.1234, 3, 4], - "B": [np.nan, "foo", "foooo", "fooooo", "bar"], - } - ) - result = df.to_string() - - expected = ( - " A B\n" - "0 NaN NaN\n" - "1 -1.0000 foo\n" - "2 -2.1234 foooo\n" - "3 3.0000 fooooo\n" - "4 4.0000 bar" - ) - assert result == expected - - df = DataFrame( - { - "A": [np.nan, -1.0, -2.0, 3.0, 4.0], - "B": [np.nan, "foo", "foooo", "fooooo", "bar"], - } - ) - result = df.to_string() - - expected = ( - " A B\n" - "0 NaN NaN\n" - "1 -1.0 foo\n" - "2 -2.0 foooo\n" - "3 3.0 fooooo\n" - "4 4.0 bar" - ) - assert result == expected - - def test_to_string_format_inf(self): - # Issue #24861 - tm.reset_display_options() - df = DataFrame( - { - "A": [-np.inf, np.inf, -1, -2.1234, 3, 4], - "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], - } - ) - result = df.to_string() - - expected = ( - " A B\n" - "0 -inf -inf\n" - "1 inf inf\n" - "2 -1.0000 foo\n" - "3 -2.1234 foooo\n" - "4 3.0000 fooooo\n" - "5 4.0000 bar" - ) - assert result == expected - - df = DataFrame( - { - "A": [-np.inf, np.inf, -1.0, -2.0, 3.0, 4.0], - "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], - } - ) - result = df.to_string() - - expected = ( - " A B\n" - "0 -inf -inf\n" - "1 inf inf\n" - "2 -1.0 foo\n" - "3 -2.0 foooo\n" - "4 3.0 fooooo\n" - "5 4.0 bar" - ) - assert result == expected - - def test_to_string_decimal(self): - # Issue #23614 - df = DataFrame({"A": [6.0, 3.1, 2.2]}) - expected = " A\n0 6,0\n1 3,1\n2 2,2" - assert df.to_string(decimal=",") == expected - - def test_to_string_line_width(self): - df = DataFrame(123, index=range(10, 15), columns=range(30)) - s = df.to_string(line_width=80) - assert max(len(line) for line in s.split("\n")) == 80 - - def test_to_string_header_false(self): - # GH 49230 - df = DataFrame([1, 2]) - df.index.name = "a" - s = df.to_string(header=False) - expected = "a \n0 1\n1 2" - assert s == expected - - df = DataFrame([[1, 2], [3, 4]]) - df.index.name = "a" - s = df.to_string(header=False) - expected = "a \n0 1 2\n1 3 4" - assert s == expected - def test_show_dimensions(self): df = DataFrame(123, index=range(10, 15), columns=range(30)) @@ -1843,145 +1224,6 @@ def test_show_dimensions(self): assert "5 rows" not in str(df) assert "5 rows" not in df._repr_html_() - def test_repr_html(self, float_frame): - df = float_frame - df._repr_html_() - - with option_context("display.max_rows", 1, "display.max_columns", 1): - df._repr_html_() - - with option_context("display.notebook_repr_html", False): - df._repr_html_() - - tm.reset_display_options() - - df = DataFrame([[1, 2], [3, 4]]) - with option_context("display.show_dimensions", True): - assert "2 rows" in df._repr_html_() - with option_context("display.show_dimensions", False): - assert "2 rows" not in df._repr_html_() - - tm.reset_display_options() - - def test_repr_html_mathjax(self): - df = DataFrame([[1, 2], [3, 4]]) - assert "tex2jax_ignore" not in df._repr_html_() - - with option_context("display.html.use_mathjax", False): - assert "tex2jax_ignore" in df._repr_html_() - - def test_repr_html_wide(self): - max_cols = 20 - df = DataFrame([["a" * 25] * (max_cols - 1)] * 10) - with option_context("display.max_rows", 60, "display.max_columns", 20): - assert "..." not in df._repr_html_() - - wide_df = DataFrame([["a" * 25] * (max_cols + 1)] * 10) - with option_context("display.max_rows", 60, "display.max_columns", 20): - assert "..." in wide_df._repr_html_() - - def test_repr_html_wide_multiindex_cols(self): - max_cols = 20 - - mcols = MultiIndex.from_product( - [np.arange(max_cols // 2), ["foo", "bar"]], names=["first", "second"] - ) - df = DataFrame([["a" * 25] * len(mcols)] * 10, columns=mcols) - reg_repr = df._repr_html_() - assert "..." not in reg_repr - - mcols = MultiIndex.from_product( - (np.arange(1 + (max_cols // 2)), ["foo", "bar"]), names=["first", "second"] - ) - df = DataFrame([["a" * 25] * len(mcols)] * 10, columns=mcols) - with option_context("display.max_rows", 60, "display.max_columns", 20): - assert "..." in df._repr_html_() - - def test_repr_html_long(self): - with option_context("display.max_rows", 60): - max_rows = get_option("display.max_rows") - h = max_rows - 1 - df = DataFrame({"A": np.arange(1, 1 + h), "B": np.arange(41, 41 + h)}) - reg_repr = df._repr_html_() - assert ".." not in reg_repr - assert str(41 + max_rows // 2) in reg_repr - - h = max_rows + 1 - df = DataFrame({"A": np.arange(1, 1 + h), "B": np.arange(41, 41 + h)}) - long_repr = df._repr_html_() - assert ".." in long_repr - assert str(41 + max_rows // 2) not in long_repr - assert f"{h} rows " in long_repr - assert "2 columns" in long_repr - - def test_repr_html_float(self): - with option_context("display.max_rows", 60): - max_rows = get_option("display.max_rows") - h = max_rows - 1 - df = DataFrame( - { - "idx": np.linspace(-10, 10, h), - "A": np.arange(1, 1 + h), - "B": np.arange(41, 41 + h), - } - ).set_index("idx") - reg_repr = df._repr_html_() - assert ".." not in reg_repr - assert f"{40 + h}" in reg_repr - - h = max_rows + 1 - df = DataFrame( - { - "idx": np.linspace(-10, 10, h), - "A": np.arange(1, 1 + h), - "B": np.arange(41, 41 + h), - } - ).set_index("idx") - long_repr = df._repr_html_() - assert ".." in long_repr - assert "31" not in long_repr - assert f"{h} rows " in long_repr - assert "2 columns" in long_repr - - def test_repr_html_long_multiindex(self): - max_rows = 60 - max_L1 = max_rows // 2 - - tuples = list(itertools.product(np.arange(max_L1), ["foo", "bar"])) - idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) - df = DataFrame( - np.random.default_rng(2).standard_normal((max_L1 * 2, 2)), - index=idx, - columns=["A", "B"], - ) - with option_context("display.max_rows", 60, "display.max_columns", 20): - reg_repr = df._repr_html_() - assert "..." not in reg_repr - - tuples = list(itertools.product(np.arange(max_L1 + 1), ["foo", "bar"])) - idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) - df = DataFrame( - np.random.default_rng(2).standard_normal(((max_L1 + 1) * 2, 2)), - index=idx, - columns=["A", "B"], - ) - long_repr = df._repr_html_() - assert "..." in long_repr - - def test_repr_html_long_and_wide(self): - max_cols = 20 - max_rows = 60 - - h, w = max_rows - 1, max_cols - 1 - df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - with option_context("display.max_rows", 60, "display.max_columns", 20): - assert "..." not in df._repr_html_() - - h, w = max_rows + 1, max_cols + 1 - df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - with option_context("display.max_rows", 60, "display.max_columns", 20): - assert "..." in df._repr_html_() - def test_info_repr(self): # GH#21746 For tests inside a terminal (i.e. not CI) we need to detect # the terminal size to ensure that we try to print something "too big" @@ -2028,43 +1270,10 @@ def test_info_repr_max_cols(self): ): assert not has_non_verbose_info_repr(df) + # FIXME: don't leave commented-out # test verbose overrides # set_option('display.max_info_columns', 4) # exceeded - def test_info_repr_html(self): - max_rows = 60 - max_cols = 20 - # Long - h, w = max_rows + 1, max_cols - 1 - df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - assert r"<class" not in df._repr_html_() - with option_context("display.large_repr", "info"): - assert r"<class" in df._repr_html_() - - # Wide - h, w = max_rows - 1, max_cols + 1 - df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - assert " never truncate assert ".." not in repr(s) - def test_to_string_name(self): - s = Series(range(100), dtype="int64") - s.name = "myser" - res = s.to_string(max_rows=2, name=True) - exp = "0 0\n ..\n99 99\nName: myser" - assert res == exp - res = s.to_string(max_rows=2, name=False) - exp = "0 0\n ..\n99 99" - assert res == exp - - def test_to_string_dtype(self): - s = Series(range(100), dtype="int64") - res = s.to_string(max_rows=2, dtype=True) - exp = "0 0\n ..\n99 99\ndtype: int64" - assert res == exp - res = s.to_string(max_rows=2, dtype=False) - exp = "0 0\n ..\n99 99" - assert res == exp - - def test_to_string_length(self): - s = Series(range(100), dtype="int64") - res = s.to_string(max_rows=2, length=True) - exp = "0 0\n ..\n99 99\nLength: 100" - assert res == exp - - def test_to_string_na_rep(self): - s = Series(index=range(100), dtype=np.float64) - res = s.to_string(na_rep="foo", max_rows=2) - exp = "0 foo\n ..\n99 foo" - assert res == exp - - def test_to_string_float_format(self): - s = Series(range(10), dtype="float64") - res = s.to_string(float_format=lambda x: f"{x:2.1f}", max_rows=2) - exp = "0 0.0\n ..\n9 9.0" - assert res == exp - - def test_to_string_header(self): - s = Series(range(10), dtype="int64") - s.index.name = "foo" - res = s.to_string(header=True, max_rows=2) - exp = "foo\n0 0\n ..\n9 9" - assert res == exp - res = s.to_string(header=False, max_rows=2) - exp = "0 0\n ..\n9 9" - assert res == exp - - def test_to_string_multindex_header(self): - # GH 16718 - df = DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index(["a", "b"]) - res = df.to_string(header=["r1", "r2"]) - exp = " r1 r2\na b \n0 1 2 3" - assert res == exp - - def test_to_string_empty_col(self): - # GH 13653 - s = Series(["", "Hello", "World", "", "", "Mooooo", "", ""]) - res = s.to_string(index=False) - exp = " \n Hello\n World\n \n \nMooooo\n \n " - assert re.match(exp, res) - class TestGenericArrayFormatter: def test_1d_array(self): @@ -3106,71 +2085,6 @@ def test_too_long(self): assert str(df) == " x\n0 1.2346e+04\n1 2.0000e+06" -class TestRepr_timedelta64: - def test_none(self): - delta_1d = pd.to_timedelta(1, unit="D") - delta_0d = pd.to_timedelta(0, unit="D") - delta_1s = pd.to_timedelta(1, unit="s") - delta_500ms = pd.to_timedelta(500, unit="ms") - - drepr = lambda x: x._repr_base() - assert drepr(delta_1d) == "1 days" - assert drepr(-delta_1d) == "-1 days" - assert drepr(delta_0d) == "0 days" - assert drepr(delta_1s) == "0 days 00:00:01" - assert drepr(delta_500ms) == "0 days 00:00:00.500000" - assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" - assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" - assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" - assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" - - def test_sub_day(self): - delta_1d = pd.to_timedelta(1, unit="D") - delta_0d = pd.to_timedelta(0, unit="D") - delta_1s = pd.to_timedelta(1, unit="s") - delta_500ms = pd.to_timedelta(500, unit="ms") - - drepr = lambda x: x._repr_base(format="sub_day") - assert drepr(delta_1d) == "1 days" - assert drepr(-delta_1d) == "-1 days" - assert drepr(delta_0d) == "00:00:00" - assert drepr(delta_1s) == "00:00:01" - assert drepr(delta_500ms) == "00:00:00.500000" - assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" - assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" - assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" - assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" - - def test_long(self): - delta_1d = pd.to_timedelta(1, unit="D") - delta_0d = pd.to_timedelta(0, unit="D") - delta_1s = pd.to_timedelta(1, unit="s") - delta_500ms = pd.to_timedelta(500, unit="ms") - - drepr = lambda x: x._repr_base(format="long") - assert drepr(delta_1d) == "1 days 00:00:00" - assert drepr(-delta_1d) == "-1 days +00:00:00" - assert drepr(delta_0d) == "0 days 00:00:00" - assert drepr(delta_1s) == "0 days 00:00:01" - assert drepr(delta_500ms) == "0 days 00:00:00.500000" - assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" - assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" - assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" - assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" - - def test_all(self): - delta_1d = pd.to_timedelta(1, unit="D") - delta_0d = pd.to_timedelta(0, unit="D") - delta_1ns = pd.to_timedelta(1, unit="ns") - - drepr = lambda x: x._repr_base(format="all") - assert drepr(delta_1d) == "1 days 00:00:00.000000000" - assert drepr(-delta_1d) == "-1 days +00:00:00.000000000" - assert drepr(delta_0d) == "0 days 00:00:00.000000000" - assert drepr(delta_1ns) == "0 days 00:00:00.000000001" - assert drepr(-delta_1d + delta_1ns) == "-1 days +00:00:00.000000001" - - class TestTimedelta64Formatter: def test_days(self): x = pd.to_timedelta(list(range(5)) + [NaT], unit="D")._values @@ -3307,69 +2221,57 @@ def test_datetime64formatter_tz_ms(self): assert result[1].strip() == "2999-01-02 00:00:00-08:00" -@pytest.mark.parametrize( - "percentiles, expected", - [ - ( - [0.01999, 0.02001, 0.5, 0.666666, 0.9999], - ["1.999%", "2.001%", "50%", "66.667%", "99.99%"], - ), - ( - [0, 0.5, 0.02001, 0.5, 0.666666, 0.9999], - ["0%", "50%", "2.0%", "50%", "66.67%", "99.99%"], - ), - ([0.281, 0.29, 0.57, 0.58], ["28.1%", "29%", "57%", "58%"]), - ([0.28, 0.29, 0.57, 0.58], ["28%", "29%", "57%", "58%"]), - ], -) -def test_format_percentiles(percentiles, expected): - result = fmt.format_percentiles(percentiles) - assert result == expected - +class TestFormatPercentiles: + @pytest.mark.parametrize( + "percentiles, expected", + [ + ( + [0.01999, 0.02001, 0.5, 0.666666, 0.9999], + ["1.999%", "2.001%", "50%", "66.667%", "99.99%"], + ), + ( + [0, 0.5, 0.02001, 0.5, 0.666666, 0.9999], + ["0%", "50%", "2.0%", "50%", "66.67%", "99.99%"], + ), + ([0.281, 0.29, 0.57, 0.58], ["28.1%", "29%", "57%", "58%"]), + ([0.28, 0.29, 0.57, 0.58], ["28%", "29%", "57%", "58%"]), + ], + ) + def test_format_percentiles(self, percentiles, expected): + result = fmt.format_percentiles(percentiles) + assert result == expected -@pytest.mark.parametrize( - "percentiles", - [([0.1, np.nan, 0.5]), ([-0.001, 0.1, 0.5]), ([2, 0.1, 0.5]), ([0.1, 0.5, "a"])], -) -def test_error_format_percentiles(percentiles): - msg = r"percentiles should all be in the interval \[0,1\]" - with pytest.raises(ValueError, match=msg): - fmt.format_percentiles(percentiles) - - -def test_format_percentiles_integer_idx(): - # Issue #26660 - result = fmt.format_percentiles(np.linspace(0, 1, 10 + 1)) - expected = [ - "0%", - "10%", - "20%", - "30%", - "40%", - "50%", - "60%", - "70%", - "80%", - "90%", - "100%", - ] - assert result == expected - - -def test_repr_html_ipython_config(ip): - code = textwrap.dedent( - """\ - from pandas import DataFrame - df = DataFrame({"A": [1, 2]}) - df._repr_html_() - - cfg = get_ipython().config - cfg['IPKernelApp']['parent_appname'] - df._repr_html_() - """ + @pytest.mark.parametrize( + "percentiles", + [ + ([0.1, np.nan, 0.5]), + ([-0.001, 0.1, 0.5]), + ([2, 0.1, 0.5]), + ([0.1, 0.5, "a"]), + ], ) - result = ip.run_cell(code, silent=True) - assert not result.error_in_exec + def test_error_format_percentiles(self, percentiles): + msg = r"percentiles should all be in the interval \[0,1\]" + with pytest.raises(ValueError, match=msg): + fmt.format_percentiles(percentiles) + + def test_format_percentiles_integer_idx(self): + # Issue #26660 + result = fmt.format_percentiles(np.linspace(0, 1, 10 + 1)) + expected = [ + "0%", + "10%", + "20%", + "30%", + "40%", + "50%", + "60%", + "70%", + "80%", + "90%", + "100%", + ] + assert result == expected @pytest.mark.parametrize("method", ["to_string", "to_html", "to_latex"]) diff --git a/pandas/tests/io/formats/test_ipython_compat.py b/pandas/tests/io/formats/test_ipython_compat.py new file mode 100644 index 0000000000000..8512f41396906 --- /dev/null +++ b/pandas/tests/io/formats/test_ipython_compat.py @@ -0,0 +1,90 @@ +import numpy as np + +import pandas._config.config as cf + +from pandas import ( + DataFrame, + MultiIndex, +) + + +class TestTableSchemaRepr: + def test_publishes(self, ip): + ipython = ip.instance(config=ip.config) + df = DataFrame({"A": [1, 2]}) + objects = [df["A"], df] # dataframe / series + expected_keys = [ + {"text/plain", "application/vnd.dataresource+json"}, + {"text/plain", "text/html", "application/vnd.dataresource+json"}, + ] + + opt = cf.option_context("display.html.table_schema", True) + last_obj = None + for obj, expected in zip(objects, expected_keys): + last_obj = obj + with opt: + formatted = ipython.display_formatter.format(obj) + assert set(formatted[0].keys()) == expected + + with_latex = cf.option_context("styler.render.repr", "latex") + + with opt, with_latex: + formatted = ipython.display_formatter.format(last_obj) + + expected = { + "text/plain", + "text/html", + "text/latex", + "application/vnd.dataresource+json", + } + assert set(formatted[0].keys()) == expected + + def test_publishes_not_implemented(self, ip): + # column MultiIndex + # GH#15996 + midx = MultiIndex.from_product([["A", "B"], ["a", "b", "c"]]) + df = DataFrame( + np.random.default_rng(2).standard_normal((5, len(midx))), columns=midx + ) + + opt = cf.option_context("display.html.table_schema", True) + + with opt: + formatted = ip.instance(config=ip.config).display_formatter.format(df) + + expected = {"text/plain", "text/html"} + assert set(formatted[0].keys()) == expected + + def test_config_on(self): + df = DataFrame({"A": [1, 2]}) + with cf.option_context("display.html.table_schema", True): + result = df._repr_data_resource_() + + assert result is not None + + def test_config_default_off(self): + df = DataFrame({"A": [1, 2]}) + with cf.option_context("display.html.table_schema", False): + result = df._repr_data_resource_() + + assert result is None + + def test_enable_data_resource_formatter(self, ip): + # GH#10491 + formatters = ip.instance(config=ip.config).display_formatter.formatters + mimetype = "application/vnd.dataresource+json" + + with cf.option_context("display.html.table_schema", True): + assert "application/vnd.dataresource+json" in formatters + assert formatters[mimetype].enabled + + # still there, just disabled + assert "application/vnd.dataresource+json" in formatters + assert not formatters[mimetype].enabled + + # able to re-set + with cf.option_context("display.html.table_schema", True): + assert "application/vnd.dataresource+json" in formatters + assert formatters[mimetype].enabled + # smoke test that it works + ip.instance(config=ip.config).display_formatter.format(cf) diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 78198bce71460..e2b65b1fdc40a 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -1,12 +1,9 @@ +# Note! This file is aimed specifically at pandas.io.formats.printing utility +# functions, not the general printing of pandas objects. import string -import numpy as np -import pytest - import pandas._config.config as cf -import pandas as pd - from pandas.io.formats import printing @@ -32,7 +29,7 @@ def test_repr_binary_type(): assert res == b -class TestFormattBase: +class TestFormatBase: def test_adjoin(self): data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] expected = "a dd ggg\nb ee hhh\nc ff iii" @@ -116,132 +113,3 @@ def test_ambiguous_width(self): expected = "あ dd ggg \nb ええ ¡¡ab\nc ff いいい" adjoined = adj.adjoin(2, *data) assert adjoined == expected - - -class TestTableSchemaRepr: - def test_publishes(self, ip): - ipython = ip.instance(config=ip.config) - df = pd.DataFrame({"A": [1, 2]}) - objects = [df["A"], df] # dataframe / series - expected_keys = [ - {"text/plain", "application/vnd.dataresource+json"}, - {"text/plain", "text/html", "application/vnd.dataresource+json"}, - ] - - opt = pd.option_context("display.html.table_schema", True) - last_obj = None - for obj, expected in zip(objects, expected_keys): - last_obj = obj - with opt: - formatted = ipython.display_formatter.format(obj) - assert set(formatted[0].keys()) == expected - - with_latex = pd.option_context("styler.render.repr", "latex") - - with opt, with_latex: - formatted = ipython.display_formatter.format(last_obj) - - expected = { - "text/plain", - "text/html", - "text/latex", - "application/vnd.dataresource+json", - } - assert set(formatted[0].keys()) == expected - - def test_publishes_not_implemented(self, ip): - # column MultiIndex - # GH 15996 - midx = pd.MultiIndex.from_product([["A", "B"], ["a", "b", "c"]]) - df = pd.DataFrame( - np.random.default_rng(2).standard_normal((5, len(midx))), columns=midx - ) - - opt = pd.option_context("display.html.table_schema", True) - - with opt: - formatted = ip.instance(config=ip.config).display_formatter.format(df) - - expected = {"text/plain", "text/html"} - assert set(formatted[0].keys()) == expected - - def test_config_on(self): - df = pd.DataFrame({"A": [1, 2]}) - with pd.option_context("display.html.table_schema", True): - result = df._repr_data_resource_() - - assert result is not None - - def test_config_default_off(self): - df = pd.DataFrame({"A": [1, 2]}) - with pd.option_context("display.html.table_schema", False): - result = df._repr_data_resource_() - - assert result is None - - def test_enable_data_resource_formatter(self, ip): - # GH 10491 - formatters = ip.instance(config=ip.config).display_formatter.formatters - mimetype = "application/vnd.dataresource+json" - - with pd.option_context("display.html.table_schema", True): - assert "application/vnd.dataresource+json" in formatters - assert formatters[mimetype].enabled - - # still there, just disabled - assert "application/vnd.dataresource+json" in formatters - assert not formatters[mimetype].enabled - - # able to re-set - with pd.option_context("display.html.table_schema", True): - assert "application/vnd.dataresource+json" in formatters - assert formatters[mimetype].enabled - # smoke test that it works - ip.instance(config=ip.config).display_formatter.format(cf) - - -def test_multiindex_long_element(): - # Non-regression test towards GH #52960 - data = pd.MultiIndex.from_tuples([("c" * 62,)]) - - expected = ( - "MultiIndex([('cccccccccccccccccccccccccccccccccccccccc" - "cccccccccccccccccccccc',)],\n )" - ) - assert str(data) == expected - - -@pytest.mark.parametrize( - "data,output", - [ - ([2, complex("nan"), 1], [" 2.0+0.0j", " NaN+0.0j", " 1.0+0.0j"]), - ([2, complex("nan"), -1], [" 2.0+0.0j", " NaN+0.0j", "-1.0+0.0j"]), - ([-2, complex("nan"), -1], ["-2.0+0.0j", " NaN+0.0j", "-1.0+0.0j"]), - ([-1.23j, complex("nan"), -1], ["-0.00-1.23j", " NaN+0.00j", "-1.00+0.00j"]), - ([1.23j, complex("nan"), 1.23], [" 0.00+1.23j", " NaN+0.00j", " 1.23+0.00j"]), - ( - [-1.23j, complex(np.nan, np.nan), 1], - ["-0.00-1.23j", " NaN+ NaNj", " 1.00+0.00j"], - ), - ( - [-1.23j, complex(1.2, np.nan), 1], - ["-0.00-1.23j", " 1.20+ NaNj", " 1.00+0.00j"], - ), - ( - [-1.23j, complex(np.nan, -1.2), 1], - ["-0.00-1.23j", " NaN-1.20j", " 1.00+0.00j"], - ), - ], -) -@pytest.mark.parametrize("as_frame", [True, False]) -def test_ser_df_with_complex_nans(data, output, as_frame): - # GH#53762, GH#53841 - obj = pd.Series(np.array(data)) - if as_frame: - obj = obj.to_frame(name="val") - reprs = [f"{i} {val}" for i, val in enumerate(output)] - expected = f"{'val': >{len(reprs[0])}}\n" + "\n".join(reprs) - else: - reprs = [f"{i} {val}" for i, val in enumerate(output)] - expected = "\n".join(reprs) + "\ndtype: complex128" - assert str(obj) == expected, f"\n{str(obj)}\n\n{expected}" diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 38a2bb52930e3..8cfcc26b95b4c 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -1,6 +1,8 @@ from datetime import datetime from io import StringIO +import itertools import re +import textwrap import numpy as np import pytest @@ -10,6 +12,7 @@ DataFrame, Index, MultiIndex, + get_option, option_context, ) import pandas._testing as tm @@ -826,43 +829,231 @@ def test_to_html_with_col_space_units(unit): assert expected in h -def test_html_repr_min_rows_default(datapath): - # gh-27991 +class TestReprHTML: + def test_html_repr_min_rows_default(self, datapath): + # gh-27991 - # default setting no truncation even if above min_rows - df = DataFrame({"a": range(20)}) - result = df._repr_html_() - expected = expected_html(datapath, "html_repr_min_rows_default_no_truncation") - assert result == expected + # default setting no truncation even if above min_rows + df = DataFrame({"a": range(20)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_no_truncation") + assert result == expected - # default of max_rows 60 triggers truncation if above - df = DataFrame({"a": range(61)}) - result = df._repr_html_() - expected = expected_html(datapath, "html_repr_min_rows_default_truncated") - assert result == expected + # default of max_rows 60 triggers truncation if above + df = DataFrame({"a": range(61)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_truncated") + assert result == expected + @pytest.mark.parametrize( + "max_rows,min_rows,expected", + [ + # truncated after first two rows + (10, 4, "html_repr_max_rows_10_min_rows_4"), + # when set to None, follow value of max_rows + (12, None, "html_repr_max_rows_12_min_rows_None"), + # when set value higher as max_rows, use the minimum + (10, 12, "html_repr_max_rows_10_min_rows_12"), + # max_rows of None -> never truncate + (None, 12, "html_repr_max_rows_None_min_rows_12"), + ], + ) + def test_html_repr_min_rows(self, datapath, max_rows, min_rows, expected): + # gh-27991 + + df = DataFrame({"a": range(61)}) + expected = expected_html(datapath, expected) + with option_context("display.max_rows", max_rows, "display.min_rows", min_rows): + result = df._repr_html_() + assert result == expected + + def test_repr_html_ipython_config(self, ip): + code = textwrap.dedent( + """\ + from pandas import DataFrame + df = DataFrame({"A": [1, 2]}) + df._repr_html_() + + cfg = get_ipython().config + cfg['IPKernelApp']['parent_appname'] + df._repr_html_() + """ + ) + result = ip.run_cell(code, silent=True) + assert not result.error_in_exec -@pytest.mark.parametrize( - "max_rows,min_rows,expected", - [ - # truncated after first two rows - (10, 4, "html_repr_max_rows_10_min_rows_4"), - # when set to None, follow value of max_rows - (12, None, "html_repr_max_rows_12_min_rows_None"), - # when set value higher as max_rows, use the minimum - (10, 12, "html_repr_max_rows_10_min_rows_12"), - # max_rows of None -> never truncate - (None, 12, "html_repr_max_rows_None_min_rows_12"), - ], -) -def test_html_repr_min_rows(datapath, max_rows, min_rows, expected): - # gh-27991 + def test_info_repr_html(self): + max_rows = 60 + max_cols = 20 + # Long + h, w = max_rows + 1, max_cols - 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + assert r"<class" not in df._repr_html_() + with option_context("display.large_repr", "info"): + assert r"<class" in df._repr_html_() - df = DataFrame({"a": range(61)}) - expected = expected_html(datapath, expected) - with option_context("display.max_rows", max_rows, "display.min_rows", min_rows): - result = df._repr_html_() - assert result == expected + # Wide + h, w = max_rows - 1, max_cols + 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + assert "{40 + h}" in reg_repr + + h = max_rows + 1 + df = DataFrame( + { + "idx": np.linspace(-10, 10, h), + "A": np.arange(1, 1 + h), + "B": np.arange(41, 41 + h), + } + ).set_index("idx") + long_repr = df._repr_html_() + assert ".." in long_repr + assert "31" not in long_repr + assert f"{h} rows " in long_repr + assert "2 columns" in long_repr + + def test_repr_html_long_multiindex(self): + max_rows = 60 + max_L1 = max_rows // 2 + + tuples = list(itertools.product(np.arange(max_L1), ["foo", "bar"])) + idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame( + np.random.default_rng(2).standard_normal((max_L1 * 2, 2)), + index=idx, + columns=["A", "B"], + ) + with option_context("display.max_rows", 60, "display.max_columns", 20): + reg_repr = df._repr_html_() + assert "..." not in reg_repr + + tuples = list(itertools.product(np.arange(max_L1 + 1), ["foo", "bar"])) + idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame( + np.random.default_rng(2).standard_normal(((max_L1 + 1) * 2, 2)), + index=idx, + columns=["A", "B"], + ) + long_repr = df._repr_html_() + assert "..." in long_repr + + def test_repr_html_long_and_wide(self): + max_cols = 20 + max_rows = 60 + + h, w = max_rows - 1, max_cols - 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." not in df._repr_html_() + + h, w = max_rows + 1, max_cols + 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." in df._repr_html_() def test_to_html_multilevel(multiindex_year_month_day_dataframe_random_data): diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 45f3b2201a599..3a8dcb339b063 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -1,63 +1,606 @@ -from datetime import datetime +from datetime import ( + datetime, + timedelta, +) from io import StringIO +import re +import sys from textwrap import dedent import numpy as np import pytest from pandas import ( + CategoricalIndex, DataFrame, + Index, + NaT, Series, + Timestamp, + concat, + date_range, + get_option, option_context, + read_csv, + timedelta_range, to_datetime, ) import pandas._testing as tm -def test_repr_embedded_ndarray(): - arr = np.empty(10, dtype=[("err", object)]) - for i in range(len(arr)): - arr["err"][i] = np.random.default_rng(2).standard_normal(i) +def _three_digit_exp(): + return f"{1.7e8:.4g}" == "1.7e+008" + + +class TestDataFrameToStringFormatters: + def test_to_string_masked_ea_with_formatter(self): + # GH#39336 + df = DataFrame( + { + "a": Series([0.123456789, 1.123456789], dtype="Float64"), + "b": Series([1, 2], dtype="Int64"), + } + ) + result = df.to_string(formatters=["{:.2f}".format, "{:.2f}".format]) + expected = dedent( + """\ + a b + 0 0.12 1.00 + 1 1.12 2.00""" + ) + assert result == expected + + def test_to_string_with_formatters(self): + df = DataFrame( + { + "int": [1, 2, 3], + "float": [1.0, 2.0, 3.0], + "object": [(1, 2), True, False], + }, + columns=["int", "float", "object"], + ) + + formatters = [ + ("int", lambda x: f"0x{x:x}"), + ("float", lambda x: f"[{x: 4.1f}]"), + ("object", lambda x: f"-{x!s}-"), + ] + result = df.to_string(formatters=dict(formatters)) + result2 = df.to_string(formatters=list(zip(*formatters))[1]) + assert result == ( + " int float object\n" + "0 0x1 [ 1.0] -(1, 2)-\n" + "1 0x2 [ 2.0] -True-\n" + "2 0x3 [ 3.0] -False-" + ) + assert result == result2 - df = DataFrame(arr) - repr(df["err"]) - repr(df) - df.to_string() + def test_to_string_with_datetime64_monthformatter(self): + months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] + x = DataFrame({"months": months}) + def format_func(x): + return x.strftime("%Y-%m") -def test_repr_tuples(): - buf = StringIO() + result = x.to_string(formatters={"months": format_func}) + expected = dedent( + """\ + months + 0 2016-01 + 1 2016-02""" + ) + assert result.strip() == expected - df = DataFrame({"tups": list(zip(range(10), range(10)))}) - repr(df) - df.to_string(col_space=10, buf=buf) + def test_to_string_with_datetime64_hourformatter(self): + x = DataFrame( + {"hod": to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f")} + ) + def format_func(x): + return x.strftime("%H:%M") -def test_to_string_truncate(): - # GH 9784 - dont truncate when calling DataFrame.to_string - df = DataFrame( - [ + result = x.to_string(formatters={"hod": format_func}) + expected = dedent( + """\ + hod + 0 10:10 + 1 12:12""" + ) + assert result.strip() == expected + + def test_to_string_with_formatters_unicode(self): + df = DataFrame({"c/\u03c3": [1, 2, 3]}) + result = df.to_string(formatters={"c/\u03c3": str}) + expected = dedent( + """\ + c/\u03c3 + 0 1 + 1 2 + 2 3""" + ) + assert result == expected + + def test_to_string_index_formatter(self): + df = DataFrame([range(5), range(5, 10), range(10, 15)]) + + rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) + + xp = dedent( + """\ + 0 1 2 3 4 + a 0 1 2 3 4 + b 5 6 7 8 9 + c 10 11 12 13 14\ + """ + ) + assert rs == xp + + def test_no_extra_space(self): + # GH#52690: Check that no extra space is given + col1 = "TEST" + col2 = "PANDAS" + col3 = "to_string" + expected = f"{col1:<6s} {col2:<7s} {col3:<10s}" + df = DataFrame([{"col1": "TEST", "col2": "PANDAS", "col3": "to_string"}]) + d = {"col1": "{:<6s}".format, "col2": "{:<7s}".format, "col3": "{:<10s}".format} + result = df.to_string(index=False, header=False, formatters=d) + assert result == expected + + +class TestDataFrameToStringColSpace: + def test_to_string_with_column_specific_col_space_raises(self): + df = DataFrame( + np.random.default_rng(2).random(size=(3, 3)), columns=["a", "b", "c"] + ) + + msg = ( + "Col_space length\\(\\d+\\) should match " + "DataFrame number of columns\\(\\d+\\)" + ) + with pytest.raises(ValueError, match=msg): + df.to_string(col_space=[30, 40]) + + with pytest.raises(ValueError, match=msg): + df.to_string(col_space=[30, 40, 50, 60]) + + msg = "unknown column" + with pytest.raises(ValueError, match=msg): + df.to_string(col_space={"a": "foo", "b": 23, "d": 34}) + + def test_to_string_with_column_specific_col_space(self): + df = DataFrame( + np.random.default_rng(2).random(size=(3, 3)), columns=["a", "b", "c"] + ) + + result = df.to_string(col_space={"a": 10, "b": 11, "c": 12}) + # 3 separating space + each col_space for (id, a, b, c) + assert len(result.split("\n")[1]) == (3 + 1 + 10 + 11 + 12) + + result = df.to_string(col_space=[10, 11, 12]) + assert len(result.split("\n")[1]) == (3 + 1 + 10 + 11 + 12) + + def test_to_string_with_col_space(self): + df = DataFrame(np.random.default_rng(2).random(size=(1, 3))) + c10 = len(df.to_string(col_space=10).split("\n")[1]) + c20 = len(df.to_string(col_space=20).split("\n")[1]) + c30 = len(df.to_string(col_space=30).split("\n")[1]) + assert c10 < c20 < c30 + + # GH#8230 + # col_space wasn't being applied with header=False + with_header = df.to_string(col_space=20) + with_header_row1 = with_header.splitlines()[1] + no_header = df.to_string(col_space=20, header=False) + assert len(with_header_row1) == len(no_header) + + def test_to_string_repr_tuples(self): + buf = StringIO() + + df = DataFrame({"tups": list(zip(range(10), range(10)))}) + repr(df) + df.to_string(col_space=10, buf=buf) + + +class TestDataFrameToStringHeader: + def test_to_string_header_false(self): + # GH#49230 + df = DataFrame([1, 2]) + df.index.name = "a" + s = df.to_string(header=False) + expected = "a \n0 1\n1 2" + assert s == expected + + df = DataFrame([[1, 2], [3, 4]]) + df.index.name = "a" + s = df.to_string(header=False) + expected = "a \n0 1 2\n1 3 4" + assert s == expected + + def test_to_string_multindex_header(self): + # GH#16718 + df = DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index(["a", "b"]) + res = df.to_string(header=["r1", "r2"]) + exp = " r1 r2\na b \n0 1 2 3" + assert res == exp + + def test_to_string_no_header(self): + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(header=False) + expected = "0 1 4\n1 2 5\n2 3 6" + + assert df_s == expected + + def test_to_string_specified_header(self): + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(header=["X", "Y"]) + expected = " X Y\n0 1 4\n1 2 5\n2 3 6" + + assert df_s == expected + + msg = "Writing 2 cols but got 1 aliases" + with pytest.raises(ValueError, match=msg): + df.to_string(header=["X"]) + + +class TestDataFrameToStringLineWidth: + def test_to_string_line_width(self): + df = DataFrame(123, index=range(10, 15), columns=range(30)) + lines = df.to_string(line_width=80) + assert max(len(line) for line in lines.split("\n")) == 80 + + def test_to_string_line_width_no_index(self): + # GH#13998, GH#22505 + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False) + expected = " x \\\n 1 \n 2 \n 3 \n\n y \n 4 \n 5 \n 6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False) + expected = " x \\\n11 \n22 \n33 \n\n y \n 4 \n 5 \n 6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) + + df_s = df.to_string(line_width=1, index=False) + expected = " x \\\n 11 \n 22 \n-33 \n\n y \n 4 \n 5 \n-6 " + + assert df_s == expected + + def test_to_string_line_width_no_header(self): + # GH#53054 + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, header=False) + expected = "0 1 \\\n1 2 \n2 3 \n\n0 4 \n1 5 \n2 6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, header=False) + expected = "0 11 \\\n1 22 \n2 33 \n\n0 4 \n1 5 \n2 6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) + + df_s = df.to_string(line_width=1, header=False) + expected = "0 11 \\\n1 22 \n2 -33 \n\n0 4 \n1 5 \n2 -6 " + + assert df_s == expected + + def test_to_string_line_width_with_both_index_and_header(self): + # GH#53054 + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1) + expected = ( + " x \\\n0 1 \n1 2 \n2 3 \n\n y \n0 4 \n1 5 \n2 6 " + ) + + assert df_s == expected + + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1) + expected = ( + " x \\\n0 11 \n1 22 \n2 33 \n\n y \n0 4 \n1 5 \n2 6 " + ) + + assert df_s == expected + + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) + + df_s = df.to_string(line_width=1) + expected = ( + " x \\\n0 11 \n1 22 \n2 -33 \n\n y \n0 4 \n1 5 \n2 -6 " + ) + + assert df_s == expected + + def test_to_string_line_width_no_index_no_header(self): + # GH#53054 + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False, header=False) + expected = "1 \\\n2 \n3 \n\n4 \n5 \n6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False, header=False) + expected = "11 \\\n22 \n33 \n\n4 \n5 \n6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) + + df_s = df.to_string(line_width=1, index=False, header=False) + expected = " 11 \\\n 22 \n-33 \n\n 4 \n 5 \n-6 " + + assert df_s == expected + + +class TestToStringNumericFormatting: + def test_to_string_float_format_no_fixed_width(self): + # GH#21625 + df = DataFrame({"x": [0.19999]}) + expected = " x\n0 0.200" + assert df.to_string(float_format="%.3f") == expected + + # GH#22270 + df = DataFrame({"x": [100.0]}) + expected = " x\n0 100" + assert df.to_string(float_format="%.0f") == expected + + def test_to_string_small_float_values(self): + df = DataFrame({"a": [1.5, 1e-17, -5.5e-7]}) + + result = df.to_string() + # sadness per above + if _three_digit_exp(): + expected = ( + " a\n" + "0 1.500000e+000\n" + "1 1.000000e-017\n" + "2 -5.500000e-007" + ) + else: + expected = ( + " a\n" + "0 1.500000e+00\n" + "1 1.000000e-17\n" + "2 -5.500000e-07" + ) + assert result == expected + + # but not all exactly zero + df = df * 0 + result = df.to_string() + expected = " 0\n0 0\n1 0\n2 -0" + # TODO: assert that these match?? + + def test_to_string_complex_float_formatting(self): + # GH #25514, 25745 + with option_context("display.precision", 5): + df = DataFrame( + { + "x": [ + (0.4467846931321966 + 0.0715185102060818j), + (0.2739442392974528 + 0.23515228785438969j), + (0.26974928742135185 + 0.3250604054898979j), + (-1j), + ] + } + ) + result = df.to_string() + expected = ( + " x\n0 0.44678+0.07152j\n" + "1 0.27394+0.23515j\n" + "2 0.26975+0.32506j\n" + "3 -0.00000-1.00000j" + ) + assert result == expected + + def test_to_string_format_inf(self): + # GH#24861 + tm.reset_display_options() + df = DataFrame( { - "a": "foo", - "b": "bar", - "c": "let's make this a very VERY long line that is longer " - "than the default 50 character limit", - "d": 1, - }, - {"a": "foo", "b": "bar", "c": "stuff", "d": 1}, - ] - ) - df.set_index(["a", "b", "c"]) - assert df.to_string() == ( - " a b " - " c d\n" - "0 foo bar let's make this a very VERY long line t" - "hat is longer than the default 50 character limit 1\n" - "1 foo bar " - " stuff 1" - ) - with option_context("max_colwidth", 20): - # the display option has no effect on the to_string method + "A": [-np.inf, np.inf, -1, -2.1234, 3, 4], + "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], + } + ) + result = df.to_string() + + expected = ( + " A B\n" + "0 -inf -inf\n" + "1 inf inf\n" + "2 -1.0000 foo\n" + "3 -2.1234 foooo\n" + "4 3.0000 fooooo\n" + "5 4.0000 bar" + ) + assert result == expected + + df = DataFrame( + { + "A": [-np.inf, np.inf, -1.0, -2.0, 3.0, 4.0], + "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], + } + ) + result = df.to_string() + + expected = ( + " A B\n" + "0 -inf -inf\n" + "1 inf inf\n" + "2 -1.0 foo\n" + "3 -2.0 foooo\n" + "4 3.0 fooooo\n" + "5 4.0 bar" + ) + assert result == expected + + def test_to_string_int_formatting(self): + df = DataFrame({"x": [-15, 20, 25, -35]}) + assert issubclass(df["x"].dtype.type, np.integer) + + output = df.to_string() + expected = " x\n0 -15\n1 20\n2 25\n3 -35" + assert output == expected + + def test_to_string_float_formatting(self): + tm.reset_display_options() + with option_context( + "display.precision", + 5, + "display.notebook_repr_html", + False, + ): + df = DataFrame( + {"x": [0, 0.25, 3456.000, 12e45, 1.64e6, 1.7e8, 1.253456, np.pi, -1e6]} + ) + + df_s = df.to_string() + + if _three_digit_exp(): + expected = ( + " x\n0 0.00000e+000\n1 2.50000e-001\n" + "2 3.45600e+003\n3 1.20000e+046\n4 1.64000e+006\n" + "5 1.70000e+008\n6 1.25346e+000\n7 3.14159e+000\n" + "8 -1.00000e+006" + ) + else: + expected = ( + " x\n0 0.00000e+00\n1 2.50000e-01\n" + "2 3.45600e+03\n3 1.20000e+46\n4 1.64000e+06\n" + "5 1.70000e+08\n6 1.25346e+00\n7 3.14159e+00\n" + "8 -1.00000e+06" + ) + assert df_s == expected + + df = DataFrame({"x": [3234, 0.253]}) + df_s = df.to_string() + + expected = " x\n0 3234.000\n1 0.253" + assert df_s == expected + + tm.reset_display_options() + assert get_option("display.precision") == 6 + + df = DataFrame({"x": [1e9, 0.2512]}) + df_s = df.to_string() + + if _three_digit_exp(): + expected = " x\n0 1.000000e+009\n1 2.512000e-001" + else: + expected = " x\n0 1.000000e+09\n1 2.512000e-01" + assert df_s == expected + + +class TestDataFrameToString: + def test_to_string_decimal(self): + # GH#23614 + df = DataFrame({"A": [6.0, 3.1, 2.2]}) + expected = " A\n0 6,0\n1 3,1\n2 2,2" + assert df.to_string(decimal=",") == expected + + def test_to_string_left_justify_cols(self): + tm.reset_display_options() + df = DataFrame({"x": [3234, 0.253]}) + df_s = df.to_string(justify="left") + expected = " x \n0 3234.000\n1 0.253" + assert df_s == expected + + def test_to_string_format_na(self): + tm.reset_display_options() + df = DataFrame( + { + "A": [np.nan, -1, -2.1234, 3, 4], + "B": [np.nan, "foo", "foooo", "fooooo", "bar"], + } + ) + result = df.to_string() + + expected = ( + " A B\n" + "0 NaN NaN\n" + "1 -1.0000 foo\n" + "2 -2.1234 foooo\n" + "3 3.0000 fooooo\n" + "4 4.0000 bar" + ) + assert result == expected + + df = DataFrame( + { + "A": [np.nan, -1.0, -2.0, 3.0, 4.0], + "B": [np.nan, "foo", "foooo", "fooooo", "bar"], + } + ) + result = df.to_string() + + expected = ( + " A B\n" + "0 NaN NaN\n" + "1 -1.0 foo\n" + "2 -2.0 foooo\n" + "3 3.0 fooooo\n" + "4 4.0 bar" + ) + assert result == expected + + def test_to_string_with_dict_entries(self): + df = DataFrame({"A": [{"a": 1, "b": 2}]}) + + val = df.to_string() + assert "'a': 1" in val + assert "'b': 2" in val + + def test_to_string_with_categorical_columns(self): + # GH#35439 + data = [[4, 2], [3, 2], [4, 3]] + cols = ["aaaaaaaaa", "b"] + df = DataFrame(data, columns=cols) + df_cat_cols = DataFrame(data, columns=CategoricalIndex(cols)) + + assert df.to_string() == df_cat_cols.to_string() + + def test_repr_embedded_ndarray(self): + arr = np.empty(10, dtype=[("err", object)]) + for i in range(len(arr)): + arr["err"][i] = np.random.default_rng(2).standard_normal(i) + + df = DataFrame(arr) + repr(df["err"]) + repr(df) + df.to_string() + + def test_to_string_truncate(self): + # GH 9784 - dont truncate when calling DataFrame.to_string + df = DataFrame( + [ + { + "a": "foo", + "b": "bar", + "c": "let's make this a very VERY long line that is longer " + "than the default 50 character limit", + "d": 1, + }, + {"a": "foo", "b": "bar", "c": "stuff", "d": 1}, + ] + ) + df.set_index(["a", "b", "c"]) assert df.to_string() == ( " a b " " c d\n" @@ -66,305 +609,607 @@ def test_to_string_truncate(): "1 foo bar " " stuff 1" ) - assert df.to_string(max_colwidth=20) == ( - " a b c d\n" - "0 foo bar let's make this ... 1\n" - "1 foo bar stuff 1" + with option_context("max_colwidth", 20): + # the display option has no effect on the to_string method + assert df.to_string() == ( + " a b " + " c d\n" + "0 foo bar let's make this a very VERY long line t" + "hat is longer than the default 50 character limit 1\n" + "1 foo bar " + " stuff 1" + ) + assert df.to_string(max_colwidth=20) == ( + " a b c d\n" + "0 foo bar let's make this ... 1\n" + "1 foo bar stuff 1" + ) + + @pytest.mark.parametrize( + "input_array, expected", + [ + ({"A": ["a"]}, "A\na"), + ({"A": ["a", "b"], "B": ["c", "dd"]}, "A B\na c\nb dd"), + ({"A": ["a", 1], "B": ["aa", 1]}, "A B\na aa\n1 1"), + ], ) + def test_format_remove_leading_space_dataframe(self, input_array, expected): + # GH#24980 + df = DataFrame(input_array).to_string(index=False) + assert df == expected + @pytest.mark.parametrize( + "data,expected", + [ + ( + {"col1": [1, 2], "col2": [3, 4]}, + " col1 col2\n0 1 3\n1 2 4", + ), + ( + {"col1": ["Abc", 0.756], "col2": [np.nan, 4.5435]}, + " col1 col2\n0 Abc NaN\n1 0.756 4.5435", + ), + ( + {"col1": [np.nan, "a"], "col2": [0.009, 3.543], "col3": ["Abc", 23]}, + " col1 col2 col3\n0 NaN 0.009 Abc\n1 a 3.543 23", + ), + ], + ) + def test_to_string_max_rows_zero(self, data, expected): + # GH#35394 + result = DataFrame(data=data).to_string(max_rows=0) + assert result == expected -@pytest.mark.parametrize( - "input_array, expected", - [ - ("a", "a"), - (["a", "b"], "a\nb"), - ([1, "a"], "1\na"), - (1, "1"), - ([0, -1], " 0\n-1"), - (1.0, "1.0"), - ([" a", " b"], " a\n b"), - ([".1", "1"], ".1\n 1"), - (["10", "-10"], " 10\n-10"), - ], -) -def test_format_remove_leading_space_series(input_array, expected): - # GH: 24980 - s = Series(input_array).to_string(index=False) - assert s == expected - - -@pytest.mark.parametrize( - "input_array, expected", - [ - ({"A": ["a"]}, "A\na"), - ({"A": ["a", "b"], "B": ["c", "dd"]}, "A B\na c\nb dd"), - ({"A": ["a", 1], "B": ["aa", 1]}, "A B\na aa\n1 1"), - ], -) -def test_format_remove_leading_space_dataframe(input_array, expected): - # GH: 24980 - df = DataFrame(input_array).to_string(index=False) - assert df == expected - - -@pytest.mark.parametrize( - "max_cols, max_rows, expected", - [ - ( - 10, - None, - " 0 1 2 3 4 ... 6 7 8 9 10\n" - " 0 0 0 0 0 ... 0 0 0 0 0\n" - " 0 0 0 0 0 ... 0 0 0 0 0\n" - " 0 0 0 0 0 ... 0 0 0 0 0\n" - " 0 0 0 0 0 ... 0 0 0 0 0", - ), - ( - None, - 2, - " 0 1 2 3 4 5 6 7 8 9 10\n" - " 0 0 0 0 0 0 0 0 0 0 0\n" - " .. .. .. .. .. .. .. .. .. .. ..\n" - " 0 0 0 0 0 0 0 0 0 0 0", - ), - ( - 10, - 2, - " 0 1 2 3 4 ... 6 7 8 9 10\n" - " 0 0 0 0 0 ... 0 0 0 0 0\n" - " .. .. .. .. .. ... .. .. .. .. ..\n" - " 0 0 0 0 0 ... 0 0 0 0 0", - ), - ( - 9, - 2, - " 0 1 2 3 ... 7 8 9 10\n" - " 0 0 0 0 ... 0 0 0 0\n" - " .. .. .. .. ... .. .. .. ..\n" - " 0 0 0 0 ... 0 0 0 0", - ), - ( - 1, - 1, - " 0 ...\n 0 ...\n.. ...", - ), - ], -) -def test_truncation_no_index(max_cols, max_rows, expected): - df = DataFrame([[0] * 11] * 4) - assert df.to_string(index=False, max_cols=max_cols, max_rows=max_rows) == expected + @pytest.mark.parametrize( + "max_cols, max_rows, expected", + [ + ( + 10, + None, + " 0 1 2 3 4 ... 6 7 8 9 10\n" + " 0 0 0 0 0 ... 0 0 0 0 0\n" + " 0 0 0 0 0 ... 0 0 0 0 0\n" + " 0 0 0 0 0 ... 0 0 0 0 0\n" + " 0 0 0 0 0 ... 0 0 0 0 0", + ), + ( + None, + 2, + " 0 1 2 3 4 5 6 7 8 9 10\n" + " 0 0 0 0 0 0 0 0 0 0 0\n" + " .. .. .. .. .. .. .. .. .. .. ..\n" + " 0 0 0 0 0 0 0 0 0 0 0", + ), + ( + 10, + 2, + " 0 1 2 3 4 ... 6 7 8 9 10\n" + " 0 0 0 0 0 ... 0 0 0 0 0\n" + " .. .. .. .. .. ... .. .. .. .. ..\n" + " 0 0 0 0 0 ... 0 0 0 0 0", + ), + ( + 9, + 2, + " 0 1 2 3 ... 7 8 9 10\n" + " 0 0 0 0 ... 0 0 0 0\n" + " .. .. .. .. ... .. .. .. ..\n" + " 0 0 0 0 ... 0 0 0 0", + ), + ( + 1, + 1, + " 0 ...\n 0 ...\n.. ...", + ), + ], + ) + def test_truncation_no_index(self, max_cols, max_rows, expected): + df = DataFrame([[0] * 11] * 4) + assert ( + df.to_string(index=False, max_cols=max_cols, max_rows=max_rows) == expected + ) + def test_to_string_no_index(self): + # GH#16839, GH#13032 + df = DataFrame({"x": [11, 22], "y": [33, -44], "z": ["AAA", " "]}) -def test_to_string_unicode_columns(float_frame): - df = DataFrame({"\u03c3": np.arange(10.0)}) + df_s = df.to_string(index=False) + # Leading space is expected for positive numbers. + expected = " x y z\n11 33 AAA\n22 -44 " + assert df_s == expected - buf = StringIO() - df.to_string(buf=buf) - buf.getvalue() + df_s = df[["y", "x", "z"]].to_string(index=False) + expected = " y x z\n 33 11 AAA\n-44 22 " + assert df_s == expected - buf = StringIO() - df.info(buf=buf) - buf.getvalue() + def test_to_string_unicode_columns(self, float_frame): + df = DataFrame({"\u03c3": np.arange(10.0)}) - result = float_frame.to_string() - assert isinstance(result, str) + buf = StringIO() + df.to_string(buf=buf) + buf.getvalue() + buf = StringIO() + df.info(buf=buf) + buf.getvalue() -def test_to_string_utf8_columns(): - n = "\u05d0".encode() + result = float_frame.to_string() + assert isinstance(result, str) - with option_context("display.max_rows", 1): + @pytest.mark.parametrize("na_rep", ["NaN", "Ted"]) + def test_to_string_na_rep_and_float_format(self, na_rep): + # GH#13828 + df = DataFrame([["A", 1.2225], ["A", None]], columns=["Group", "Data"]) + result = df.to_string(na_rep=na_rep, float_format="{:.2f}".format) + expected = dedent( + f"""\ + Group Data + 0 A 1.22 + 1 A {na_rep}""" + ) + assert result == expected + + def test_to_string_string_dtype(self): + # GH#50099 + pytest.importorskip("pyarrow") + df = DataFrame( + {"x": ["foo", "bar", "baz"], "y": ["a", "b", "c"], "z": [1, 2, 3]} + ) + df = df.astype( + {"x": "string[pyarrow]", "y": "string[python]", "z": "int64[pyarrow]"} + ) + result = df.dtypes.to_string() + expected = dedent( + """\ + x string[pyarrow] + y string[python] + z int64[pyarrow]""" + ) + assert result == expected + + def test_to_string_pos_args_deprecation(self): + # GH#54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + "Starting with pandas version 3.0 all arguments of to_string " + "except for the " + "argument 'buf' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + buf = StringIO() + df.to_string(buf, None, None, True, True) + + def test_to_string_utf8_columns(self): + n = "\u05d0".encode() df = DataFrame([1, 2], columns=[n]) + + with option_context("display.max_rows", 1): + repr(df) + + def test_to_string_unicode_two(self): + dm = DataFrame({"c/\u03c3": []}) + buf = StringIO() + dm.to_string(buf) + + def test_to_string_unicode_three(self): + dm = DataFrame(["\xc2"]) + buf = StringIO() + dm.to_string(buf) + + def test_to_string_with_float_index(self): + index = Index([1.5, 2, 3, 4, 5]) + df = DataFrame(np.arange(5), index=index) + + result = df.to_string() + expected = " 0\n1.5 0\n2.0 1\n3.0 2\n4.0 3\n5.0 4" + assert result == expected + + def test_to_string(self): + # big mixed + biggie = DataFrame( + { + "A": np.random.default_rng(2).standard_normal(200), + "B": tm.makeStringIndex(200), + }, + ) + + biggie.loc[:20, "A"] = np.nan + biggie.loc[:20, "B"] = np.nan + s = biggie.to_string() + + buf = StringIO() + retval = biggie.to_string(buf=buf) + assert retval is None + assert buf.getvalue() == s + + assert isinstance(s, str) + + # print in right order + result = biggie.to_string( + columns=["B", "A"], col_space=17, float_format="%.5f".__mod__ + ) + lines = result.split("\n") + header = lines[0].strip().split() + joined = "\n".join([re.sub(r"\s+", " ", x).strip() for x in lines[1:]]) + recons = read_csv(StringIO(joined), names=header, header=None, sep=" ") + tm.assert_series_equal(recons["B"], biggie["B"]) + assert recons["A"].count() == biggie["A"].count() + assert (np.abs(recons["A"].dropna() - biggie["A"].dropna()) < 0.1).all() + + # FIXME: don't leave commented-out + # expected = ['B', 'A'] + # assert header == expected + + result = biggie.to_string(columns=["A"], col_space=17) + header = result.split("\n")[0].strip().split() + expected = ["A"] + assert header == expected + + biggie.to_string(columns=["B", "A"], formatters={"A": lambda x: f"{x:.1f}"}) + + biggie.to_string(columns=["B", "A"], float_format=str) + biggie.to_string(columns=["B", "A"], col_space=12, float_format=str) + + frame = DataFrame(index=np.arange(200)) + frame.to_string() + + # TODO: split or simplify this test? + def test_to_string_index_with_nan(self): + # GH#2850 + df = DataFrame( + { + "id1": {0: "1a3", 1: "9h4"}, + "id2": {0: np.nan, 1: "d67"}, + "id3": {0: "78d", 1: "79d"}, + "value": {0: 123, 1: 64}, + } + ) + + # multi-index + y = df.set_index(["id1", "id2", "id3"]) + result = y.to_string() + expected = ( + " value\nid1 id2 id3 \n" + "1a3 NaN 78d 123\n9h4 d67 79d 64" + ) + assert result == expected + + # index + y = df.set_index("id2") + result = y.to_string() + expected = ( + " id1 id3 value\nid2 \n" + "NaN 1a3 78d 123\nd67 9h4 79d 64" + ) + assert result == expected + + # with append (this failed in 0.12) + y = df.set_index(["id1", "id2"]).set_index("id3", append=True) + result = y.to_string() + expected = ( + " value\nid1 id2 id3 \n" + "1a3 NaN 78d 123\n9h4 d67 79d 64" + ) + assert result == expected + + # all-nan in mi + df2 = df.copy() + df2.loc[:, "id2"] = np.nan + y = df2.set_index("id2") + result = y.to_string() + expected = ( + " id1 id3 value\nid2 \n" + "NaN 1a3 78d 123\nNaN 9h4 79d 64" + ) + assert result == expected + + # partial nan in mi + df2 = df.copy() + df2.loc[:, "id2"] = np.nan + y = df2.set_index(["id2", "id3"]) + result = y.to_string() + expected = ( + " id1 value\nid2 id3 \n" + "NaN 78d 1a3 123\n 79d 9h4 64" + ) + assert result == expected + + df = DataFrame( + { + "id1": {0: np.nan, 1: "9h4"}, + "id2": {0: np.nan, 1: "d67"}, + "id3": {0: np.nan, 1: "79d"}, + "value": {0: 123, 1: 64}, + } + ) + + y = df.set_index(["id1", "id2", "id3"]) + result = y.to_string() + expected = ( + " value\nid1 id2 id3 \n" + "NaN NaN NaN 123\n9h4 d67 79d 64" + ) + assert result == expected + + def test_to_string_nonunicode_nonascii_alignment(self): + df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) + rep_str = df.to_string() + lines = rep_str.split("\n") + assert len(lines[1]) == len(lines[2]) + + def test_unicode_problem_decoding_as_ascii(self): + df = DataFrame({"c/\u03c3": Series({"test": np.nan})}) + str(df.to_string()) + + def test_to_string_repr_unicode(self): + buf = StringIO() + + unicode_values = ["\u03c3"] * 10 + unicode_values = np.array(unicode_values, dtype=object) + df = DataFrame({"unicode": unicode_values}) + df.to_string(col_space=10, buf=buf) + + # it works! repr(df) + # it works even if sys.stdin in None + _stdin = sys.stdin + try: + sys.stdin = None + repr(df) + finally: + sys.stdin = _stdin -def test_to_string_unicode_two(): - dm = DataFrame({"c/\u03c3": []}) - buf = StringIO() - dm.to_string(buf) +class TestSeriesToString: + def test_to_string_without_index(self): + # GH#11729 Test index=False option + ser = Series([1, 2, 3, 4]) + result = ser.to_string(index=False) + expected = "\n".join(["1", "2", "3", "4"]) + assert result == expected + def test_to_string_name(self): + ser = Series(range(100), dtype="int64") + ser.name = "myser" + res = ser.to_string(max_rows=2, name=True) + exp = "0 0\n ..\n99 99\nName: myser" + assert res == exp + res = ser.to_string(max_rows=2, name=False) + exp = "0 0\n ..\n99 99" + assert res == exp -def test_to_string_unicode_three(): - dm = DataFrame(["\xc2"]) - buf = StringIO() - dm.to_string(buf) + def test_to_string_dtype(self): + ser = Series(range(100), dtype="int64") + res = ser.to_string(max_rows=2, dtype=True) + exp = "0 0\n ..\n99 99\ndtype: int64" + assert res == exp + res = ser.to_string(max_rows=2, dtype=False) + exp = "0 0\n ..\n99 99" + assert res == exp + def test_to_string_length(self): + ser = Series(range(100), dtype="int64") + res = ser.to_string(max_rows=2, length=True) + exp = "0 0\n ..\n99 99\nLength: 100" + assert res == exp -def test_to_string_with_formatters(): - df = DataFrame( - { - "int": [1, 2, 3], - "float": [1.0, 2.0, 3.0], - "object": [(1, 2), True, False], - }, - columns=["int", "float", "object"], - ) + def test_to_string_na_rep(self): + ser = Series(index=range(100), dtype=np.float64) + res = ser.to_string(na_rep="foo", max_rows=2) + exp = "0 foo\n ..\n99 foo" + assert res == exp - formatters = [ - ("int", lambda x: f"0x{x:x}"), - ("float", lambda x: f"[{x: 4.1f}]"), - ("object", lambda x: f"-{x!s}-"), - ] - result = df.to_string(formatters=dict(formatters)) - result2 = df.to_string(formatters=list(zip(*formatters))[1]) - assert result == ( - " int float object\n" - "0 0x1 [ 1.0] -(1, 2)-\n" - "1 0x2 [ 2.0] -True-\n" - "2 0x3 [ 3.0] -False-" - ) - assert result == result2 + def test_to_string_float_format(self): + ser = Series(range(10), dtype="float64") + res = ser.to_string(float_format=lambda x: f"{x:2.1f}", max_rows=2) + exp = "0 0.0\n ..\n9 9.0" + assert res == exp + def test_to_string_header(self): + ser = Series(range(10), dtype="int64") + ser.index.name = "foo" + res = ser.to_string(header=True, max_rows=2) + exp = "foo\n0 0\n ..\n9 9" + assert res == exp + res = ser.to_string(header=False, max_rows=2) + exp = "0 0\n ..\n9 9" + assert res == exp -def test_to_string_with_datetime64_monthformatter(): - months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] - x = DataFrame({"months": months}) + def test_to_string_empty_col(self): + # GH#13653 + ser = Series(["", "Hello", "World", "", "", "Mooooo", "", ""]) + res = ser.to_string(index=False) + exp = " \n Hello\n World\n \n \nMooooo\n \n " + assert re.match(exp, res) - def format_func(x): - return x.strftime("%Y-%m") + def test_to_string_timedelta64(self): + Series(np.array([1100, 20], dtype="timedelta64[ns]")).to_string() - result = x.to_string(formatters={"months": format_func}) - expected = dedent( - """\ - months - 0 2016-01 - 1 2016-02""" - ) - assert result.strip() == expected + ser = Series(date_range("2012-1-1", periods=3, freq="D")) + # GH#2146 -def test_to_string_with_datetime64_hourformatter(): - x = DataFrame( - {"hod": to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f")} - ) + # adding NaTs + y = ser - ser.shift(1) + result = y.to_string() + assert "1 days" in result + assert "00:00:00" not in result + assert "NaT" in result - def format_func(x): - return x.strftime("%H:%M") + # with frac seconds + o = Series([datetime(2012, 1, 1, microsecond=150)] * 3) + y = ser - o + result = y.to_string() + assert "-1 days +23:59:59.999850" in result - result = x.to_string(formatters={"hod": format_func}) - expected = dedent( - """\ - hod - 0 10:10 - 1 12:12""" - ) - assert result.strip() == expected - - -def test_to_string_with_formatters_unicode(): - df = DataFrame({"c/\u03c3": [1, 2, 3]}) - result = df.to_string(formatters={"c/\u03c3": str}) - expected = dedent( - """\ - c/\u03c3 - 0 1 - 1 2 - 2 3""" - ) - assert result == expected + # rounding? + o = Series([datetime(2012, 1, 1, 1)] * 3) + y = ser - o + result = y.to_string() + assert "-1 days +23:00:00" in result + assert "1 days 23:00:00" in result + o = Series([datetime(2012, 1, 1, 1, 1)] * 3) + y = ser - o + result = y.to_string() + assert "-1 days +22:59:00" in result + assert "1 days 22:59:00" in result -def test_to_string_complex_number_trims_zeros(): - s = Series([1.000000 + 1.000000j, 1.0 + 1.0j, 1.05 + 1.0j]) - result = s.to_string() - expected = dedent( - """\ - 0 1.00+1.00j - 1 1.00+1.00j - 2 1.05+1.00j""" - ) - assert result == expected - - -def test_nullable_float_to_string(float_ea_dtype): - # https://github.com/pandas-dev/pandas/issues/36775 - dtype = float_ea_dtype - s = Series([0.0, 1.0, None], dtype=dtype) - result = s.to_string() - expected = dedent( - """\ - 0 0.0 - 1 1.0 - 2 """ - ) - assert result == expected - - -def test_nullable_int_to_string(any_int_ea_dtype): - # https://github.com/pandas-dev/pandas/issues/36775 - dtype = any_int_ea_dtype - s = Series([0, 1, None], dtype=dtype) - result = s.to_string() - expected = dedent( - """\ - 0 0 - 1 1 - 2 """ - ) - assert result == expected - - -@pytest.mark.parametrize("na_rep", ["NaN", "Ted"]) -def test_to_string_na_rep_and_float_format(na_rep): - # GH 13828 - df = DataFrame([["A", 1.2225], ["A", None]], columns=["Group", "Data"]) - result = df.to_string(na_rep=na_rep, float_format="{:.2f}".format) - expected = dedent( - f"""\ - Group Data - 0 A 1.22 - 1 A {na_rep}""" - ) - assert result == expected - - -@pytest.mark.parametrize( - "data,expected", - [ - ( - {"col1": [1, 2], "col2": [3, 4]}, - " col1 col2\n0 1 3\n1 2 4", - ), - ( - {"col1": ["Abc", 0.756], "col2": [np.nan, 4.5435]}, - " col1 col2\n0 Abc NaN\n1 0.756 4.5435", - ), - ( - {"col1": [np.nan, "a"], "col2": [0.009, 3.543], "col3": ["Abc", 23]}, - " col1 col2 col3\n0 NaN 0.009 Abc\n1 a 3.543 23", - ), - ], -) -def test_to_string_max_rows_zero(data, expected): - # GH35394 - result = DataFrame(data=data).to_string(max_rows=0) - assert result == expected - - -def test_to_string_string_dtype(): - # GH#50099 - pytest.importorskip("pyarrow") - df = DataFrame({"x": ["foo", "bar", "baz"], "y": ["a", "b", "c"], "z": [1, 2, 3]}) - df = df.astype( - {"x": "string[pyarrow]", "y": "string[python]", "z": "int64[pyarrow]"} - ) - result = df.dtypes.to_string() - expected = dedent( - """\ - x string[pyarrow] - y string[python] - z int64[pyarrow]""" - ) - assert result == expected + o = Series([datetime(2012, 1, 1, 1, 1, microsecond=150)] * 3) + y = ser - o + result = y.to_string() + assert "-1 days +22:58:59.999850" in result + assert "0 days 22:58:59.999850" in result + # neg time + td = timedelta(minutes=5, seconds=3) + s2 = Series(date_range("2012-1-1", periods=3, freq="D")) + td + y = ser - s2 + result = y.to_string() + assert "-1 days +23:54:57" in result -def test_to_string_pos_args_deprecation(): - # GH-54229 - df = DataFrame({"a": [1, 2, 3]}) - msg = ( - r"Starting with pandas version 3.0 all arguments of to_string except for the " - r"argument 'buf' will be keyword-only." - ) - with tm.assert_produces_warning(FutureWarning, match=msg): + td = timedelta(microseconds=550) + s2 = Series(date_range("2012-1-1", periods=3, freq="D")) + td + y = ser - td + result = y.to_string() + assert "2012-01-01 23:59:59.999450" in result + + # no boxing of the actual elements + td = Series(timedelta_range("1 days", periods=3)) + result = td.to_string() + assert result == "0 1 days\n1 2 days\n2 3 days" + + def test_to_string(self): + ts = tm.makeTimeSeries() buf = StringIO() - df.to_string(buf, None, None, True, True) + + s = ts.to_string() + + retval = ts.to_string(buf=buf) + assert retval is None + assert buf.getvalue().strip() == s + + # pass float_format + format = "%.4f".__mod__ + result = ts.to_string(float_format=format) + result = [x.split()[1] for x in result.split("\n")[:-1]] + expected = [format(x) for x in ts] + assert result == expected + + # empty string + result = ts[:0].to_string() + assert result == "Series([], Freq: B)" + + result = ts[:0].to_string(length=0) + assert result == "Series([], Freq: B)" + + # name and length + cp = ts.copy() + cp.name = "foo" + result = cp.to_string(length=True, name=True, dtype=True) + last_line = result.split("\n")[-1].strip() + assert last_line == (f"Freq: B, Name: foo, Length: {len(cp)}, dtype: float64") + + @pytest.mark.parametrize( + "input_array, expected", + [ + ("a", "a"), + (["a", "b"], "a\nb"), + ([1, "a"], "1\na"), + (1, "1"), + ([0, -1], " 0\n-1"), + (1.0, "1.0"), + ([" a", " b"], " a\n b"), + ([".1", "1"], ".1\n 1"), + (["10", "-10"], " 10\n-10"), + ], + ) + def test_format_remove_leading_space_series(self, input_array, expected): + # GH: 24980 + ser = Series(input_array) + result = ser.to_string(index=False) + assert result == expected + + def test_to_string_complex_number_trims_zeros(self): + ser = Series([1.000000 + 1.000000j, 1.0 + 1.0j, 1.05 + 1.0j]) + result = ser.to_string() + expected = dedent( + """\ + 0 1.00+1.00j + 1 1.00+1.00j + 2 1.05+1.00j""" + ) + assert result == expected + + def test_nullable_float_to_string(self, float_ea_dtype): + # https://github.com/pandas-dev/pandas/issues/36775 + dtype = float_ea_dtype + ser = Series([0.0, 1.0, None], dtype=dtype) + result = ser.to_string() + expected = dedent( + """\ + 0 0.0 + 1 1.0 + 2 """ + ) + assert result == expected + + def test_nullable_int_to_string(self, any_int_ea_dtype): + # https://github.com/pandas-dev/pandas/issues/36775 + dtype = any_int_ea_dtype + ser = Series([0, 1, None], dtype=dtype) + result = ser.to_string() + expected = dedent( + """\ + 0 0 + 1 1 + 2 """ + ) + assert result == expected + + def test_to_string_mixed(self): + ser = Series(["foo", np.nan, -1.23, 4.56]) + result = ser.to_string() + expected = "".join(["0 foo\n", "1 NaN\n", "2 -1.23\n", "3 4.56"]) + assert result == expected + + # but don't count NAs as floats + ser = Series(["foo", np.nan, "bar", "baz"]) + result = ser.to_string() + expected = "".join(["0 foo\n", "1 NaN\n", "2 bar\n", "3 baz"]) + assert result == expected + + ser = Series(["foo", 5, "bar", "baz"]) + result = ser.to_string() + expected = "".join(["0 foo\n", "1 5\n", "2 bar\n", "3 baz"]) + assert result == expected + + def test_to_string_float_na_spacing(self): + ser = Series([0.0, 1.5678, 2.0, -3.0, 4.0]) + ser[::2] = np.nan + + result = ser.to_string() + expected = ( + "0 NaN\n" + "1 1.5678\n" + "2 NaN\n" + "3 -3.0000\n" + "4 NaN" + ) + assert result == expected + + def test_to_string_with_datetimeindex(self): + index = date_range("20130102", periods=6) + ser = Series(1, index=index) + result = ser.to_string() + assert "2013-01-02" in result + + # nat in index + s2 = Series(2, index=[Timestamp("20130111"), NaT]) + ser = concat([s2, ser]) + result = ser.to_string() + assert "NaT" in result + + # nat in summary + result = str(s2.index) + assert "NaT" in result diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 18eee01f87621..a9540c94ce10e 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -147,7 +147,9 @@ def test_unsupported_dtype(c_parser_only, match, kwargs): @td.skip_if_32bit @pytest.mark.slow -def test_precise_conversion(c_parser_only): +# test numbers between 1 and 2 +@pytest.mark.parametrize("num", np.linspace(1.0, 2.0, num=21)) +def test_precise_conversion(c_parser_only, num): parser = c_parser_only normal_errors = [] @@ -156,27 +158,23 @@ def test_precise_conversion(c_parser_only): def error(val: float, actual_val: Decimal) -> Decimal: return abs(Decimal(f"{val:.100}") - actual_val) - # test numbers between 1 and 2 - for num in np.linspace(1.0, 2.0, num=500): - # 25 decimal digits of precision - text = f"a\n{num:.25}" + # 25 decimal digits of precision + text = f"a\n{num:.25}" - normal_val = float( - parser.read_csv(StringIO(text), float_precision="legacy")["a"][0] - ) - precise_val = float( - parser.read_csv(StringIO(text), float_precision="high")["a"][0] - ) - roundtrip_val = float( - parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0] - ) - actual_val = Decimal(text[2:]) + normal_val = float( + parser.read_csv(StringIO(text), float_precision="legacy")["a"][0] + ) + precise_val = float(parser.read_csv(StringIO(text), float_precision="high")["a"][0]) + roundtrip_val = float( + parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0] + ) + actual_val = Decimal(text[2:]) - normal_errors.append(error(normal_val, actual_val)) - precise_errors.append(error(precise_val, actual_val)) + normal_errors.append(error(normal_val, actual_val)) + precise_errors.append(error(precise_val, actual_val)) - # round-trip should match float() - assert roundtrip_val == float(text[2:]) + # round-trip should match float() + assert roundtrip_val == float(text[2:]) assert sum(precise_errors) <= sum(normal_errors) assert max(precise_errors) <= max(normal_errors) @@ -287,7 +285,8 @@ def test_tokenize_CR_with_quoting(c_parser_only): @pytest.mark.slow -def test_grow_boundary_at_cap(c_parser_only): +@pytest.mark.parametrize("count", [3 * 2**n for n in range(6)]) +def test_grow_boundary_at_cap(c_parser_only, count): # See gh-12494 # # Cause of error was that the C parser @@ -296,19 +295,18 @@ def test_grow_boundary_at_cap(c_parser_only): # to capacity, which would later cause a # buffer overflow error when checking the # EOF terminator of the CSV stream. + # 3 * 2^n commas was observed to break the parser parser = c_parser_only - def test_empty_header_read(count): - with StringIO("," * count) as s: - expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)]) - df = parser.read_csv(s) - tm.assert_frame_equal(df, expected) - - for cnt in range(1, 101): - test_empty_header_read(cnt) + with StringIO("," * count) as s: + expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)]) + df = parser.read_csv(s) + tm.assert_frame_equal(df, expected) -def test_parse_trim_buffers(c_parser_only): +@pytest.mark.slow +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_parse_trim_buffers(c_parser_only, encoding): # This test is part of a bugfix for gh-13703. It attempts to # to stress the system memory allocator, to cause it to move the # stream buffer and either let the OS reclaim the region, or let @@ -319,6 +317,9 @@ def test_parse_trim_buffers(c_parser_only): # times it fails due to memory corruption, which causes the # loaded DataFrame to differ from the expected one. + # Also force 'utf-8' encoding, so that `_string_convert` would take + # a different execution branch. + parser = c_parser_only # Generate a large mixed-type CSV file on-the-fly (one record is @@ -374,25 +375,16 @@ def test_parse_trim_buffers(c_parser_only): ) # Iterate over the CSV file in chunks of `chunksize` lines - with parser.read_csv( - StringIO(csv_data), header=None, dtype=object, chunksize=chunksize - ) as chunks_: - result = concat(chunks_, axis=0, ignore_index=True) - - # Check for data corruption if there was no segfault - tm.assert_frame_equal(result, expected) - - # This extra test was added to replicate the fault in gh-5291. - # Force 'utf-8' encoding, so that `_string_convert` would take - # a different execution branch. with parser.read_csv( StringIO(csv_data), header=None, dtype=object, chunksize=chunksize, - encoding="utf_8", + encoding=encoding, ) as chunks_: result = concat(chunks_, axis=0, ignore_index=True) + + # Check for data corruption if there was no segfault tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 554151841aa22..d546d1275441d 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -32,6 +32,7 @@ import pandas._testing as tm from pandas._testing._hypothesis import DATETIME_NO_TZ from pandas.core.indexes.datetimes import date_range +from pandas.core.tools.datetimes import start_caching_at from pandas.io.parsers import read_csv @@ -1285,7 +1286,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value): # if we have an invalid date make sure that we handle this with # and w/o the cache properly parser = all_parsers - s = StringIO((f"{value},\n") * 50000) + s = StringIO((f"{value},\n") * (start_caching_at + 1)) warn = None msg = "Passing a BlockManager to DataFrame" diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index b53b1304374d7..302cab8e90db7 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1,6 +1,5 @@ from datetime import datetime from functools import partial -from io import StringIO import numpy as np import pytest @@ -271,34 +270,30 @@ def test_resample_rounding(unit): # GH 8371 # odd results when rounding is needed - data = """date,time,value -11-08-2014,00:00:01.093,1 -11-08-2014,00:00:02.159,1 -11-08-2014,00:00:02.667,1 -11-08-2014,00:00:03.175,1 -11-08-2014,00:00:07.058,1 -11-08-2014,00:00:07.362,1 -11-08-2014,00:00:08.324,1 -11-08-2014,00:00:08.830,1 -11-08-2014,00:00:08.982,1 -11-08-2014,00:00:09.815,1 -11-08-2014,00:00:10.540,1 -11-08-2014,00:00:11.061,1 -11-08-2014,00:00:11.617,1 -11-08-2014,00:00:13.607,1 -11-08-2014,00:00:14.535,1 -11-08-2014,00:00:15.525,1 -11-08-2014,00:00:17.960,1 -11-08-2014,00:00:20.674,1 -11-08-2014,00:00:21.191,1""" - - df = pd.read_csv( - StringIO(data), - parse_dates={"timestamp": ["date", "time"]}, - index_col="timestamp", - ) + ts = [ + "2014-11-08 00:00:01", + "2014-11-08 00:00:02", + "2014-11-08 00:00:02", + "2014-11-08 00:00:03", + "2014-11-08 00:00:07", + "2014-11-08 00:00:07", + "2014-11-08 00:00:08", + "2014-11-08 00:00:08", + "2014-11-08 00:00:08", + "2014-11-08 00:00:09", + "2014-11-08 00:00:10", + "2014-11-08 00:00:11", + "2014-11-08 00:00:11", + "2014-11-08 00:00:13", + "2014-11-08 00:00:14", + "2014-11-08 00:00:15", + "2014-11-08 00:00:17", + "2014-11-08 00:00:20", + "2014-11-08 00:00:21", + ] + df = DataFrame({"value": [1] * 19}, index=pd.to_datetime(ts)) df.index = df.index.as_unit(unit) - df.index.name = None + result = df.resample("6s").sum() expected = DataFrame( {"value": [4, 9, 4, 2]}, @@ -516,7 +511,7 @@ def test_upsample_with_limit(unit): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("freq", ["5D", "10h", "5Min", "10s"]) +@pytest.mark.parametrize("freq", ["1D", "10h", "5Min", "10s"]) @pytest.mark.parametrize("rule", ["Y", "3ME", "15D", "30h", "15Min", "30s"]) def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule, unit): # GH 33939 diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 71606fb72c0f6..fe8e243919d05 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -19,6 +19,22 @@ ) import pandas._testing as tm +UNITS = ["s", "ms", "us", "ns"] + + +@pytest.fixture(params=UNITS) +def unit(request): + return request.param + + +unit2 = unit + + +def _get_finer_unit(unit, unit2): + if UNITS.index(unit) >= UNITS.index(unit2): + return unit + return unit2 + class TestDatetimeConcat: def test_concat_datetime64_block(self): @@ -307,50 +323,58 @@ def test_concat_tz_series2(self): result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) - def test_concat_tz_series3(self): + def test_concat_tz_series3(self, unit, unit2): # see gh-12217 and gh-12306 # Concatenating two UTC times - first = DataFrame([[datetime(2016, 1, 1)]]) + first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]") first[0] = first[0].dt.tz_localize("UTC") - second = DataFrame([[datetime(2016, 1, 2)]]) + second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]") second[0] = second[0].dt.tz_localize("UTC") result = concat([first, second]) - assert result[0].dtype == "datetime64[ns, UTC]" + exp_unit = _get_finer_unit(unit, unit2) + assert result[0].dtype == f"datetime64[{exp_unit}, UTC]" - def test_concat_tz_series4(self): + def test_concat_tz_series4(self, unit, unit2): # Concatenating two London times - first = DataFrame([[datetime(2016, 1, 1)]]) + first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]") first[0] = first[0].dt.tz_localize("Europe/London") - second = DataFrame([[datetime(2016, 1, 2)]]) + second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]") second[0] = second[0].dt.tz_localize("Europe/London") result = concat([first, second]) - assert result[0].dtype == "datetime64[ns, Europe/London]" + exp_unit = _get_finer_unit(unit, unit2) + assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]" - def test_concat_tz_series5(self): + def test_concat_tz_series5(self, unit, unit2): # Concatenating 2+1 London times - first = DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) + first = DataFrame( + [[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]], dtype=f"M8[{unit}]" + ) first[0] = first[0].dt.tz_localize("Europe/London") - second = DataFrame([[datetime(2016, 1, 3)]]) + second = DataFrame([[datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]") second[0] = second[0].dt.tz_localize("Europe/London") result = concat([first, second]) - assert result[0].dtype == "datetime64[ns, Europe/London]" + exp_unit = _get_finer_unit(unit, unit2) + assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]" - def test_concat_tz_series6(self): - # Concat'ing 1+2 London times - first = DataFrame([[datetime(2016, 1, 1)]]) + def test_concat_tz_series6(self, unit, unit2): + # Concatenating 1+2 London times + first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]") first[0] = first[0].dt.tz_localize("Europe/London") - second = DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) + second = DataFrame( + [[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]" + ) second[0] = second[0].dt.tz_localize("Europe/London") result = concat([first, second]) - assert result[0].dtype == "datetime64[ns, Europe/London]" + exp_unit = _get_finer_unit(unit, unit2) + assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]" def test_concat_tz_series_tzlocal(self): # see gh-13583 diff --git a/pandas/tests/reshape/merge/data/allow_exact_matches.csv b/pandas/tests/reshape/merge/data/allow_exact_matches.csv deleted file mode 100644 index 0446fb744c540..0000000000000 --- a/pandas/tests/reshape/merge/data/allow_exact_matches.csv +++ /dev/null @@ -1,28 +0,0 @@ -time,ticker,price,quantity,marketCenter,bid,ask -20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,, -20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 -20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 -20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,, -20160525 13:30:00.075,AAPL,98.55,6,ARCA,, -20160525 13:30:00.075,AAPL,98.55,6,ARCA,, -20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 diff --git a/pandas/tests/reshape/merge/data/allow_exact_matches_and_tolerance.csv b/pandas/tests/reshape/merge/data/allow_exact_matches_and_tolerance.csv deleted file mode 100644 index 0446fb744c540..0000000000000 --- a/pandas/tests/reshape/merge/data/allow_exact_matches_and_tolerance.csv +++ /dev/null @@ -1,28 +0,0 @@ -time,ticker,price,quantity,marketCenter,bid,ask -20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,, -20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 -20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 -20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,, -20160525 13:30:00.075,AAPL,98.55,6,ARCA,, -20160525 13:30:00.075,AAPL,98.55,6,ARCA,, -20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 diff --git a/pandas/tests/reshape/merge/data/asof.csv b/pandas/tests/reshape/merge/data/asof.csv deleted file mode 100644 index d7d061bc46ccc..0000000000000 --- a/pandas/tests/reshape/merge/data/asof.csv +++ /dev/null @@ -1,28 +0,0 @@ -time,ticker,price,quantity,marketCenter,bid,ask -20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95 -20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 -20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 -20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 -20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 diff --git a/pandas/tests/reshape/merge/data/asof2.csv b/pandas/tests/reshape/merge/data/asof2.csv deleted file mode 100644 index 2c9c0392dd617..0000000000000 --- a/pandas/tests/reshape/merge/data/asof2.csv +++ /dev/null @@ -1,78 +0,0 @@ -time,ticker,price,quantity,marketCenter,bid,ask -20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95 -20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 -20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 -20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 -20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 -20160525 13:30:00.084,AAPL,98.64,40,NASDAQ,98.55,98.56 -20160525 13:30:00.084,AAPL,98.55,149,EDGX,98.55,98.56 -20160525 13:30:00.086,AAPL,98.56,500,ARCA,98.55,98.63 -20160525 13:30:00.104,AAPL,98.63,647,EDGX,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,300,EDGX,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,50,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,50,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,70,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,70,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,1,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,62,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,10,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,100,ARCA,98.62,98.63 -20160525 13:30:00.105,AAPL,98.63,100,ARCA,98.62,98.63 -20160525 13:30:00.105,AAPL,98.63,700,ARCA,98.62,98.63 -20160525 13:30:00.106,AAPL,98.63,61,EDGX,98.62,98.63 -20160525 13:30:00.107,AAPL,98.63,100,ARCA,98.62,98.63 -20160525 13:30:00.107,AAPL,98.63,53,ARCA,98.62,98.63 -20160525 13:30:00.108,AAPL,98.63,100,ARCA,98.62,98.63 -20160525 13:30:00.108,AAPL,98.63,839,ARCA,98.62,98.63 -20160525 13:30:00.115,AAPL,98.63,5,EDGX,98.62,98.63 -20160525 13:30:00.118,AAPL,98.63,295,EDGX,98.62,98.63 -20160525 13:30:00.118,AAPL,98.63,5,EDGX,98.62,98.63 -20160525 13:30:00.128,AAPL,98.63,100,NASDAQ,98.62,98.63 -20160525 13:30:00.128,AAPL,98.63,100,NASDAQ,98.62,98.63 -20160525 13:30:00.128,MSFT,51.92,100,ARCA,51.92,51.95 -20160525 13:30:00.129,AAPL,98.62,100,NASDAQ,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,10,NASDAQ,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,59,NASDAQ,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,31,NASDAQ,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,69,NASDAQ,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,12,NASDAQ,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,12,EDGX,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,100,ARCA,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,100,ARCA,98.61,98.63 -20160525 13:30:00.130,MSFT,51.95,317,ARCA,51.93,51.95 -20160525 13:30:00.130,MSFT,51.95,283,ARCA,51.93,51.95 -20160525 13:30:00.135,MSFT,51.93,100,EDGX,51.92,51.95 -20160525 13:30:00.135,AAPL,98.62,100,ARCA,98.61,98.62 -20160525 13:30:00.144,AAPL,98.62,12,NASDAQ,98.61,98.62 -20160525 13:30:00.144,AAPL,98.62,88,NASDAQ,98.61,98.62 -20160525 13:30:00.144,AAPL,98.62,162,NASDAQ,98.61,98.62 -20160525 13:30:00.144,AAPL,98.61,100,BATS,98.61,98.62 -20160525 13:30:00.144,AAPL,98.62,61,ARCA,98.61,98.62 -20160525 13:30:00.144,AAPL,98.62,25,ARCA,98.61,98.62 -20160525 13:30:00.144,AAPL,98.62,14,ARCA,98.61,98.62 -20160525 13:30:00.145,AAPL,98.62,12,ARCA,98.6,98.63 -20160525 13:30:00.145,AAPL,98.62,100,ARCA,98.6,98.63 -20160525 13:30:00.145,AAPL,98.63,100,NASDAQ,98.6,98.63 -20160525 13:30:00.145,AAPL,98.63,100,NASDAQ,98.6,98.63 diff --git a/pandas/tests/reshape/merge/data/quotes.csv b/pandas/tests/reshape/merge/data/quotes.csv deleted file mode 100644 index 3f31d2cfffe1b..0000000000000 --- a/pandas/tests/reshape/merge/data/quotes.csv +++ /dev/null @@ -1,17 +0,0 @@ -time,ticker,bid,ask -20160525 13:30:00.023,GOOG,720.50,720.93 -20160525 13:30:00.023,MSFT,51.95,51.95 -20160525 13:30:00.041,MSFT,51.95,51.95 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.072,GOOG,720.50,720.88 -20160525 13:30:00.075,AAPL,98.55,98.56 -20160525 13:30:00.076,AAPL,98.55,98.56 -20160525 13:30:00.076,AAPL,98.55,98.56 -20160525 13:30:00.076,AAPL,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,51.95 -20160525 13:30:00.078,MSFT,51.92,51.95 diff --git a/pandas/tests/reshape/merge/data/quotes2.csv b/pandas/tests/reshape/merge/data/quotes2.csv deleted file mode 100644 index 7ade1e7faf1ae..0000000000000 --- a/pandas/tests/reshape/merge/data/quotes2.csv +++ /dev/null @@ -1,57 +0,0 @@ -time,ticker,bid,ask -20160525 13:30:00.023,GOOG,720.50,720.93 -20160525 13:30:00.023,MSFT,51.95,51.95 -20160525 13:30:00.041,MSFT,51.95,51.95 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.072,GOOG,720.50,720.88 -20160525 13:30:00.075,AAPL,98.55,98.56 -20160525 13:30:00.076,AAPL,98.55,98.56 -20160525 13:30:00.076,AAPL,98.55,98.56 -20160525 13:30:00.076,AAPL,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,51.95 -20160525 13:30:00.078,MSFT,51.92,51.95 -20160525 13:30:00.079,MSFT,51.92,51.95 -20160525 13:30:00.080,AAPL,98.55,98.56 -20160525 13:30:00.084,AAPL,98.55,98.56 -20160525 13:30:00.086,AAPL,98.55,98.63 -20160525 13:30:00.088,AAPL,98.65,98.63 -20160525 13:30:00.089,AAPL,98.63,98.63 -20160525 13:30:00.104,AAPL,98.63,98.63 -20160525 13:30:00.104,AAPL,98.63,98.63 -20160525 13:30:00.104,AAPL,98.63,98.63 -20160525 13:30:00.104,AAPL,98.63,98.63 -20160525 13:30:00.104,AAPL,98.62,98.63 -20160525 13:30:00.105,AAPL,98.62,98.63 -20160525 13:30:00.107,AAPL,98.62,98.63 -20160525 13:30:00.115,AAPL,98.62,98.63 -20160525 13:30:00.115,AAPL,98.62,98.63 -20160525 13:30:00.118,AAPL,98.62,98.63 -20160525 13:30:00.128,AAPL,98.62,98.63 -20160525 13:30:00.128,AAPL,98.62,98.63 -20160525 13:30:00.129,AAPL,98.62,98.63 -20160525 13:30:00.129,AAPL,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,98.63 -20160525 13:30:00.129,AAPL,98.62,98.63 -20160525 13:30:00.129,AAPL,98.61,98.63 -20160525 13:30:00.130,MSFT,51.93,51.95 -20160525 13:30:00.130,MSFT,51.93,51.95 -20160525 13:30:00.130,AAPL,98.61,98.63 -20160525 13:30:00.131,AAPL,98.61,98.62 -20160525 13:30:00.131,AAPL,98.61,98.62 -20160525 13:30:00.135,MSFT,51.92,51.95 -20160525 13:30:00.135,AAPL,98.61,98.62 -20160525 13:30:00.136,AAPL,98.61,98.62 -20160525 13:30:00.136,AAPL,98.61,98.62 -20160525 13:30:00.144,AAPL,98.61,98.62 -20160525 13:30:00.144,AAPL,98.61,98.62 -20160525 13:30:00.145,AAPL,98.61,98.62 -20160525 13:30:00.145,AAPL,98.61,98.63 -20160525 13:30:00.145,AAPL,98.61,98.63 -20160525 13:30:00.145,AAPL,98.60,98.63 -20160525 13:30:00.145,AAPL,98.61,98.63 -20160525 13:30:00.145,AAPL,98.60,98.63 diff --git a/pandas/tests/reshape/merge/data/tolerance.csv b/pandas/tests/reshape/merge/data/tolerance.csv deleted file mode 100644 index d7d061bc46ccc..0000000000000 --- a/pandas/tests/reshape/merge/data/tolerance.csv +++ /dev/null @@ -1,28 +0,0 @@ -time,ticker,price,quantity,marketCenter,bid,ask -20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95 -20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 -20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 -20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 -20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 diff --git a/pandas/tests/reshape/merge/data/trades.csv b/pandas/tests/reshape/merge/data/trades.csv deleted file mode 100644 index b26a4ce714255..0000000000000 --- a/pandas/tests/reshape/merge/data/trades.csv +++ /dev/null @@ -1,28 +0,0 @@ -time,ticker,price,quantity,marketCenter -20160525 13:30:00.023,MSFT,51.9500,75,NASDAQ -20160525 13:30:00.038,MSFT,51.9500,155,NASDAQ -20160525 13:30:00.048,GOOG,720.7700,100,NASDAQ -20160525 13:30:00.048,GOOG,720.9200,100,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,200,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,300,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,600,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,44,NASDAQ -20160525 13:30:00.074,AAPL,98.6700,478343,NASDAQ -20160525 13:30:00.075,AAPL,98.6700,478343,NASDAQ -20160525 13:30:00.075,AAPL,98.6600,6,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,30,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,75,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,20,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,35,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,10,NASDAQ -20160525 13:30:00.075,AAPL,98.5500,6,ARCA -20160525 13:30:00.075,AAPL,98.5500,6,ARCA -20160525 13:30:00.076,AAPL,98.5600,1000,ARCA -20160525 13:30:00.076,AAPL,98.5600,200,ARCA -20160525 13:30:00.076,AAPL,98.5600,300,ARCA -20160525 13:30:00.076,AAPL,98.5600,400,ARCA -20160525 13:30:00.076,AAPL,98.5600,600,ARCA -20160525 13:30:00.076,AAPL,98.5600,200,ARCA -20160525 13:30:00.078,MSFT,51.9500,783,NASDAQ -20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ -20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ diff --git a/pandas/tests/reshape/merge/data/trades2.csv b/pandas/tests/reshape/merge/data/trades2.csv deleted file mode 100644 index 64021faa68ce3..0000000000000 --- a/pandas/tests/reshape/merge/data/trades2.csv +++ /dev/null @@ -1,78 +0,0 @@ -time,ticker,price,quantity,marketCenter -20160525 13:30:00.023,MSFT,51.9500,75,NASDAQ -20160525 13:30:00.038,MSFT,51.9500,155,NASDAQ -20160525 13:30:00.048,GOOG,720.7700,100,NASDAQ -20160525 13:30:00.048,GOOG,720.9200,100,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,200,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,300,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,600,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,44,NASDAQ -20160525 13:30:00.074,AAPL,98.6700,478343,NASDAQ -20160525 13:30:00.075,AAPL,98.6700,478343,NASDAQ -20160525 13:30:00.075,AAPL,98.6600,6,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,30,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,75,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,20,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,35,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,10,NASDAQ -20160525 13:30:00.075,AAPL,98.5500,6,ARCA -20160525 13:30:00.075,AAPL,98.5500,6,ARCA -20160525 13:30:00.076,AAPL,98.5600,1000,ARCA -20160525 13:30:00.076,AAPL,98.5600,200,ARCA -20160525 13:30:00.076,AAPL,98.5600,300,ARCA -20160525 13:30:00.076,AAPL,98.5600,400,ARCA -20160525 13:30:00.076,AAPL,98.5600,600,ARCA -20160525 13:30:00.076,AAPL,98.5600,200,ARCA -20160525 13:30:00.078,MSFT,51.9500,783,NASDAQ -20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ -20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ -20160525 13:30:00.084,AAPL,98.6400,40,NASDAQ -20160525 13:30:00.084,AAPL,98.5500,149,EDGX -20160525 13:30:00.086,AAPL,98.5600,500,ARCA -20160525 13:30:00.104,AAPL,98.6300,647,EDGX -20160525 13:30:00.104,AAPL,98.6300,300,EDGX -20160525 13:30:00.104,AAPL,98.6300,50,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,50,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,70,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,70,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,1,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,62,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,10,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,100,ARCA -20160525 13:30:00.105,AAPL,98.6300,100,ARCA -20160525 13:30:00.105,AAPL,98.6300,700,ARCA -20160525 13:30:00.106,AAPL,98.6300,61,EDGX -20160525 13:30:00.107,AAPL,98.6300,100,ARCA -20160525 13:30:00.107,AAPL,98.6300,53,ARCA -20160525 13:30:00.108,AAPL,98.6300,100,ARCA -20160525 13:30:00.108,AAPL,98.6300,839,ARCA -20160525 13:30:00.115,AAPL,98.6300,5,EDGX -20160525 13:30:00.118,AAPL,98.6300,295,EDGX -20160525 13:30:00.118,AAPL,98.6300,5,EDGX -20160525 13:30:00.128,AAPL,98.6300,100,NASDAQ -20160525 13:30:00.128,AAPL,98.6300,100,NASDAQ -20160525 13:30:00.128,MSFT,51.9200,100,ARCA -20160525 13:30:00.129,AAPL,98.6200,100,NASDAQ -20160525 13:30:00.129,AAPL,98.6200,10,NASDAQ -20160525 13:30:00.129,AAPL,98.6200,59,NASDAQ -20160525 13:30:00.129,AAPL,98.6200,31,NASDAQ -20160525 13:30:00.129,AAPL,98.6200,69,NASDAQ -20160525 13:30:00.129,AAPL,98.6200,12,NASDAQ -20160525 13:30:00.129,AAPL,98.6200,12,EDGX -20160525 13:30:00.129,AAPL,98.6200,100,ARCA -20160525 13:30:00.129,AAPL,98.6200,100,ARCA -20160525 13:30:00.130,MSFT,51.9500,317,ARCA -20160525 13:30:00.130,MSFT,51.9500,283,ARCA -20160525 13:30:00.135,MSFT,51.9300,100,EDGX -20160525 13:30:00.135,AAPL,98.6200,100,ARCA -20160525 13:30:00.144,AAPL,98.6200,12,NASDAQ -20160525 13:30:00.144,AAPL,98.6200,88,NASDAQ -20160525 13:30:00.144,AAPL,98.6200,162,NASDAQ -20160525 13:30:00.144,AAPL,98.6100,100,BATS -20160525 13:30:00.144,AAPL,98.6200,61,ARCA -20160525 13:30:00.144,AAPL,98.6200,25,ARCA -20160525 13:30:00.144,AAPL,98.6200,14,ARCA -20160525 13:30:00.145,AAPL,98.6200,12,ARCA -20160525 13:30:00.145,AAPL,98.6200,100,ARCA -20160525 13:30:00.145,AAPL,98.6300,100,NASDAQ -20160525 13:30:00.145,AAPL,98.6300,100,NASDAQ diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 50ead8747fa2f..d7aed23d63cfb 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -11,7 +11,6 @@ Index, Timedelta, merge_asof, - read_csv, to_datetime, ) import pandas._testing as tm @@ -27,39 +26,1070 @@ def unit(request): class TestAsOfMerge: - def read_data(self, datapath, name, dedupe=False): - path = datapath("reshape", "merge", "data", name) - x = read_csv(path) + def prep_data(self, df, dedupe=False): if dedupe: - x = x.drop_duplicates(["time", "ticker"], keep="last").reset_index( + df = df.drop_duplicates(["time", "ticker"], keep="last").reset_index( drop=True ) - x.time = to_datetime(x.time) - return x + df.time = to_datetime(df.time) + return df @pytest.fixture - def trades(self, datapath): - return self.read_data(datapath, "trades.csv") + def trades(self): + df = pd.DataFrame( + [ + ["20160525 13:30:00.023", "MSFT", "51.9500", "75", "NASDAQ"], + ["20160525 13:30:00.038", "MSFT", "51.9500", "155", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.7700", "100", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9200", "100", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "200", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "300", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "600", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "44", "NASDAQ"], + ["20160525 13:30:00.074", "AAPL", "98.6700", "478343", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6700", "478343", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6600", "6", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "30", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "75", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "20", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "35", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "10", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.5500", "6", "ARCA"], + ["20160525 13:30:00.075", "AAPL", "98.5500", "6", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "1000", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "200", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "300", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "400", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "600", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "200", "ARCA"], + ["20160525 13:30:00.078", "MSFT", "51.9500", "783", "NASDAQ"], + ["20160525 13:30:00.078", "MSFT", "51.9500", "100", "NASDAQ"], + ["20160525 13:30:00.078", "MSFT", "51.9500", "100", "NASDAQ"], + ], + columns="time,ticker,price,quantity,marketCenter".split(","), + ) + df["price"] = df["price"].astype("float64") + df["quantity"] = df["quantity"].astype("int64") + return self.prep_data(df) @pytest.fixture - def quotes(self, datapath): - return self.read_data(datapath, "quotes.csv", dedupe=True) + def quotes(self): + df = pd.DataFrame( + [ + ["20160525 13:30:00.023", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.023", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.041", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.072", "GOOG", "720.50", "720.88"], + ["20160525 13:30:00.075", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.078", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.078", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.078", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.078", "MSFT", "51.92", "51.95"], + ], + columns="time,ticker,bid,ask".split(","), + ) + df["bid"] = df["bid"].astype("float64") + df["ask"] = df["ask"].astype("float64") + return self.prep_data(df, dedupe=True) @pytest.fixture - def asof(self, datapath): - return self.read_data(datapath, "asof.csv") + def asof(self): + df = pd.DataFrame( + [ + [ + "20160525 13:30:00.023", + "MSFT", + "51.95", + "75", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.038", + "MSFT", + "51.95", + "155", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.77", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.92", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "200", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "300", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "600", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "44", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.074", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.67", + "478343", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.66", + "6", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "30", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "75", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "20", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "35", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "10", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.55", + "6", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.55", + "6", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "1000", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "300", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "400", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "600", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "783", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.92", + "51.95", + ], + ], + columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), + ) + df["price"] = df["price"].astype("float64") + df["quantity"] = df["quantity"].astype("int64") + df["bid"] = df["bid"].astype("float64") + df["ask"] = df["ask"].astype("float64") + return self.prep_data(df) @pytest.fixture - def tolerance(self, datapath): - return self.read_data(datapath, "tolerance.csv") + def tolerance(self): + df = pd.DataFrame( + [ + [ + "20160525 13:30:00.023", + "MSFT", + "51.95", + "75", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.038", + "MSFT", + "51.95", + "155", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.77", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.92", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "200", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "300", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "600", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "44", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.074", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.67", + "478343", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.66", + "6", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "30", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "75", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "20", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "35", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "10", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.55", + "6", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.55", + "6", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "1000", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "300", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "400", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "600", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "783", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.92", + "51.95", + ], + ], + columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), + ) + df["price"] = df["price"].astype("float64") + df["quantity"] = df["quantity"].astype("int64") + df["bid"] = df["bid"].astype("float64") + df["ask"] = df["ask"].astype("float64") + return self.prep_data(df) @pytest.fixture def allow_exact_matches(self, datapath): - return self.read_data(datapath, "allow_exact_matches.csv") + df = pd.DataFrame( + [ + [ + "20160525 13:30:00.023", + "MSFT", + "51.95", + "75", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.038", + "MSFT", + "51.95", + "155", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.77", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.92", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "200", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "300", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "600", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "44", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.074", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.66", + "6", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "30", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "75", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "20", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "35", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "10", + "NASDAQ", + np.nan, + np.nan, + ], + ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], + ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "1000", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "300", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "400", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "600", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "783", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.95", + "51.95", + ], + ], + columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), + ) + df["price"] = df["price"].astype("float64") + df["quantity"] = df["quantity"].astype("int64") + df["bid"] = df["bid"].astype("float64") + df["ask"] = df["ask"].astype("float64") + return self.prep_data(df) @pytest.fixture - def allow_exact_matches_and_tolerance(self, datapath): - return self.read_data(datapath, "allow_exact_matches_and_tolerance.csv") + def allow_exact_matches_and_tolerance(self): + df = pd.DataFrame( + [ + [ + "20160525 13:30:00.023", + "MSFT", + "51.95", + "75", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.038", + "MSFT", + "51.95", + "155", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.77", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.92", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "200", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "300", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "600", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "44", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.074", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.66", + "6", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "30", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "75", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "20", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "35", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "10", + "NASDAQ", + np.nan, + np.nan, + ], + ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], + ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "1000", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "300", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "400", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "600", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "783", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.95", + "51.95", + ], + ], + columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), + ) + df["price"] = df["price"].astype("float64") + df["quantity"] = df["quantity"].astype("int64") + df["bid"] = df["bid"].astype("float64") + df["ask"] = df["ask"].astype("float64") + return self.prep_data(df) def test_examples1(self): """doc-string examples""" @@ -501,9 +1531,860 @@ def test_multiby_indexed(self): ) def test_basic2(self, datapath): - expected = self.read_data(datapath, "asof2.csv") - trades = self.read_data(datapath, "trades2.csv") - quotes = self.read_data(datapath, "quotes2.csv", dedupe=True) + expected = pd.DataFrame( + [ + [ + "20160525 13:30:00.023", + "MSFT", + "51.95", + "75", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.038", + "MSFT", + "51.95", + "155", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.77", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.92", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "200", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "300", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "600", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "44", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.074", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.67", + "478343", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.66", + "6", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "30", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "75", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "20", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "35", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "10", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.55", + "6", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.55", + "6", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "1000", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "300", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "400", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "600", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "783", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.084", + "AAPL", + "98.64", + "40", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.084", + "AAPL", + "98.55", + "149", + "EDGX", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.086", + "AAPL", + "98.56", + "500", + "ARCA", + "98.55", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "647", + "EDGX", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "300", + "EDGX", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "50", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "50", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "70", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "70", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "1", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "62", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "10", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "100", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.105", + "AAPL", + "98.63", + "100", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.105", + "AAPL", + "98.63", + "700", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.106", + "AAPL", + "98.63", + "61", + "EDGX", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.107", + "AAPL", + "98.63", + "100", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.107", + "AAPL", + "98.63", + "53", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.108", + "AAPL", + "98.63", + "100", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.108", + "AAPL", + "98.63", + "839", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.115", + "AAPL", + "98.63", + "5", + "EDGX", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.118", + "AAPL", + "98.63", + "295", + "EDGX", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.118", + "AAPL", + "98.63", + "5", + "EDGX", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.128", + "AAPL", + "98.63", + "100", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.128", + "AAPL", + "98.63", + "100", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.128", + "MSFT", + "51.92", + "100", + "ARCA", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "100", + "NASDAQ", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "10", + "NASDAQ", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "59", + "NASDAQ", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "31", + "NASDAQ", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "69", + "NASDAQ", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "12", + "NASDAQ", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "12", + "EDGX", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "100", + "ARCA", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "100", + "ARCA", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.130", + "MSFT", + "51.95", + "317", + "ARCA", + "51.93", + "51.95", + ], + [ + "20160525 13:30:00.130", + "MSFT", + "51.95", + "283", + "ARCA", + "51.93", + "51.95", + ], + [ + "20160525 13:30:00.135", + "MSFT", + "51.93", + "100", + "EDGX", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.135", + "AAPL", + "98.62", + "100", + "ARCA", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.62", + "12", + "NASDAQ", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.62", + "88", + "NASDAQ", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.62", + "162", + "NASDAQ", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.61", + "100", + "BATS", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.62", + "61", + "ARCA", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.62", + "25", + "ARCA", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.62", + "14", + "ARCA", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.145", + "AAPL", + "98.62", + "12", + "ARCA", + "98.6", + "98.63", + ], + [ + "20160525 13:30:00.145", + "AAPL", + "98.62", + "100", + "ARCA", + "98.6", + "98.63", + ], + [ + "20160525 13:30:00.145", + "AAPL", + "98.63", + "100", + "NASDAQ", + "98.6", + "98.63", + ], + [ + "20160525 13:30:00.145", + "AAPL", + "98.63", + "100", + "NASDAQ", + "98.6", + "98.63", + ], + ], + columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), + ) + expected["price"] = expected["price"].astype("float64") + expected["quantity"] = expected["quantity"].astype("int64") + expected["bid"] = expected["bid"].astype("float64") + expected["ask"] = expected["ask"].astype("float64") + expected = self.prep_data(expected) + + trades = pd.DataFrame( + [ + ["20160525 13:30:00.023", "MSFT", "51.9500", "75", "NASDAQ"], + ["20160525 13:30:00.038", "MSFT", "51.9500", "155", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.7700", "100", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9200", "100", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "200", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "300", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "600", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "44", "NASDAQ"], + ["20160525 13:30:00.074", "AAPL", "98.6700", "478343", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6700", "478343", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6600", "6", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "30", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "75", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "20", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "35", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "10", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.5500", "6", "ARCA"], + ["20160525 13:30:00.075", "AAPL", "98.5500", "6", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "1000", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "200", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "300", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "400", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "600", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "200", "ARCA"], + ["20160525 13:30:00.078", "MSFT", "51.9500", "783", "NASDAQ"], + ["20160525 13:30:00.078", "MSFT", "51.9500", "100", "NASDAQ"], + ["20160525 13:30:00.078", "MSFT", "51.9500", "100", "NASDAQ"], + ["20160525 13:30:00.084", "AAPL", "98.6400", "40", "NASDAQ"], + ["20160525 13:30:00.084", "AAPL", "98.5500", "149", "EDGX"], + ["20160525 13:30:00.086", "AAPL", "98.5600", "500", "ARCA"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "647", "EDGX"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "300", "EDGX"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "50", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "50", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "70", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "70", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "1", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "62", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "10", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "100", "ARCA"], + ["20160525 13:30:00.105", "AAPL", "98.6300", "100", "ARCA"], + ["20160525 13:30:00.105", "AAPL", "98.6300", "700", "ARCA"], + ["20160525 13:30:00.106", "AAPL", "98.6300", "61", "EDGX"], + ["20160525 13:30:00.107", "AAPL", "98.6300", "100", "ARCA"], + ["20160525 13:30:00.107", "AAPL", "98.6300", "53", "ARCA"], + ["20160525 13:30:00.108", "AAPL", "98.6300", "100", "ARCA"], + ["20160525 13:30:00.108", "AAPL", "98.6300", "839", "ARCA"], + ["20160525 13:30:00.115", "AAPL", "98.6300", "5", "EDGX"], + ["20160525 13:30:00.118", "AAPL", "98.6300", "295", "EDGX"], + ["20160525 13:30:00.118", "AAPL", "98.6300", "5", "EDGX"], + ["20160525 13:30:00.128", "AAPL", "98.6300", "100", "NASDAQ"], + ["20160525 13:30:00.128", "AAPL", "98.6300", "100", "NASDAQ"], + ["20160525 13:30:00.128", "MSFT", "51.9200", "100", "ARCA"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "100", "NASDAQ"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "10", "NASDAQ"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "59", "NASDAQ"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "31", "NASDAQ"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "69", "NASDAQ"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "12", "NASDAQ"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "12", "EDGX"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "100", "ARCA"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "100", "ARCA"], + ["20160525 13:30:00.130", "MSFT", "51.9500", "317", "ARCA"], + ["20160525 13:30:00.130", "MSFT", "51.9500", "283", "ARCA"], + ["20160525 13:30:00.135", "MSFT", "51.9300", "100", "EDGX"], + ["20160525 13:30:00.135", "AAPL", "98.6200", "100", "ARCA"], + ["20160525 13:30:00.144", "AAPL", "98.6200", "12", "NASDAQ"], + ["20160525 13:30:00.144", "AAPL", "98.6200", "88", "NASDAQ"], + ["20160525 13:30:00.144", "AAPL", "98.6200", "162", "NASDAQ"], + ["20160525 13:30:00.144", "AAPL", "98.6100", "100", "BATS"], + ["20160525 13:30:00.144", "AAPL", "98.6200", "61", "ARCA"], + ["20160525 13:30:00.144", "AAPL", "98.6200", "25", "ARCA"], + ["20160525 13:30:00.144", "AAPL", "98.6200", "14", "ARCA"], + ["20160525 13:30:00.145", "AAPL", "98.6200", "12", "ARCA"], + ["20160525 13:30:00.145", "AAPL", "98.6200", "100", "ARCA"], + ["20160525 13:30:00.145", "AAPL", "98.6300", "100", "NASDAQ"], + ["20160525 13:30:00.145", "AAPL", "98.6300", "100", "NASDAQ"], + ], + columns="time,ticker,price,quantity,marketCenter".split(","), + ) + trades["price"] = trades["price"].astype("float64") + trades["quantity"] = trades["quantity"].astype("int64") + trades = self.prep_data(trades) + + quotes = pd.DataFrame( + [ + ["20160525 13:30:00.023", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.023", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.041", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.072", "GOOG", "720.50", "720.88"], + ["20160525 13:30:00.075", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.078", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.078", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.078", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.078", "MSFT", "51.92", "51.95"], + ["20160525 13:30:00.079", "MSFT", "51.92", "51.95"], + ["20160525 13:30:00.080", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.084", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.086", "AAPL", "98.55", "98.63"], + ["20160525 13:30:00.088", "AAPL", "98.65", "98.63"], + ["20160525 13:30:00.089", "AAPL", "98.63", "98.63"], + ["20160525 13:30:00.104", "AAPL", "98.63", "98.63"], + ["20160525 13:30:00.104", "AAPL", "98.63", "98.63"], + ["20160525 13:30:00.104", "AAPL", "98.63", "98.63"], + ["20160525 13:30:00.104", "AAPL", "98.63", "98.63"], + ["20160525 13:30:00.104", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.105", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.107", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.115", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.115", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.118", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.128", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.128", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.129", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.129", "AAPL", "98.61", "98.63"], + ["20160525 13:30:00.129", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.129", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.129", "AAPL", "98.61", "98.63"], + ["20160525 13:30:00.130", "MSFT", "51.93", "51.95"], + ["20160525 13:30:00.130", "MSFT", "51.93", "51.95"], + ["20160525 13:30:00.130", "AAPL", "98.61", "98.63"], + ["20160525 13:30:00.131", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.131", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.135", "MSFT", "51.92", "51.95"], + ["20160525 13:30:00.135", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.136", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.136", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.144", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.144", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.145", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.145", "AAPL", "98.61", "98.63"], + ["20160525 13:30:00.145", "AAPL", "98.61", "98.63"], + ["20160525 13:30:00.145", "AAPL", "98.60", "98.63"], + ["20160525 13:30:00.145", "AAPL", "98.61", "98.63"], + ["20160525 13:30:00.145", "AAPL", "98.60", "98.63"], + ], + columns="time,ticker,bid,ask".split(","), + ) + quotes["bid"] = quotes["bid"].astype("float64") + quotes["ask"] = quotes["ask"].astype("float64") + quotes = self.prep_data(quotes, dedupe=True) result = merge_asof(trades, quotes, on="time", by="ticker") tm.assert_frame_equal(result, expected) @@ -535,14 +2416,14 @@ def test_valid_join_keys(self, trades, quotes): with pytest.raises(MergeError, match="can only asof on a key for left"): merge_asof(trades, quotes, by="ticker") - def test_with_duplicates(self, datapath, trades, quotes): + def test_with_duplicates(self, datapath, trades, quotes, asof): q = ( pd.concat([quotes, quotes]) .sort_values(["time", "ticker"]) .reset_index(drop=True) ) result = merge_asof(trades, q, on="time", by="ticker") - expected = self.read_data(datapath, "asof.csv") + expected = self.prep_data(asof) tm.assert_frame_equal(result, expected) def test_with_duplicates_no_on(self): diff --git a/pandas/tests/scalar/timedelta/test_formats.py b/pandas/tests/scalar/timedelta/test_formats.py index 753186ee4b738..e1b0076d5b7b9 100644 --- a/pandas/tests/scalar/timedelta/test_formats.py +++ b/pandas/tests/scalar/timedelta/test_formats.py @@ -42,3 +42,68 @@ def test_repr(td, expected_repr): ) def test_isoformat(td, expected_iso): assert td.isoformat() == expected_iso + + +class TestReprBase: + def test_none(self): + delta_1d = Timedelta(1, unit="D") + delta_0d = Timedelta(0, unit="D") + delta_1s = Timedelta(1, unit="s") + delta_500ms = Timedelta(500, unit="ms") + + drepr = lambda x: x._repr_base() + assert drepr(delta_1d) == "1 days" + assert drepr(-delta_1d) == "-1 days" + assert drepr(delta_0d) == "0 days" + assert drepr(delta_1s) == "0 days 00:00:01" + assert drepr(delta_500ms) == "0 days 00:00:00.500000" + assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" + assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" + assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" + assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" + + def test_sub_day(self): + delta_1d = Timedelta(1, unit="D") + delta_0d = Timedelta(0, unit="D") + delta_1s = Timedelta(1, unit="s") + delta_500ms = Timedelta(500, unit="ms") + + drepr = lambda x: x._repr_base(format="sub_day") + assert drepr(delta_1d) == "1 days" + assert drepr(-delta_1d) == "-1 days" + assert drepr(delta_0d) == "00:00:00" + assert drepr(delta_1s) == "00:00:01" + assert drepr(delta_500ms) == "00:00:00.500000" + assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" + assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" + assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" + assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" + + def test_long(self): + delta_1d = Timedelta(1, unit="D") + delta_0d = Timedelta(0, unit="D") + delta_1s = Timedelta(1, unit="s") + delta_500ms = Timedelta(500, unit="ms") + + drepr = lambda x: x._repr_base(format="long") + assert drepr(delta_1d) == "1 days 00:00:00" + assert drepr(-delta_1d) == "-1 days +00:00:00" + assert drepr(delta_0d) == "0 days 00:00:00" + assert drepr(delta_1s) == "0 days 00:00:01" + assert drepr(delta_500ms) == "0 days 00:00:00.500000" + assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" + assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" + assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" + assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" + + def test_all(self): + delta_1d = Timedelta(1, unit="D") + delta_0d = Timedelta(0, unit="D") + delta_1ns = Timedelta(1, unit="ns") + + drepr = lambda x: x._repr_base(format="all") + assert drepr(delta_1d) == "1 days 00:00:00.000000000" + assert drepr(-delta_1d) == "-1 days +00:00:00.000000000" + assert drepr(delta_0d) == "0 days 00:00:00.000000000" + assert drepr(delta_1ns) == "0 days 00:00:00.000000001" + assert drepr(-delta_1d + delta_1ns) == "-1 days +00:00:00.000000001" diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index d7d33ae058af8..53ac7fbf40af1 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -281,7 +281,7 @@ def test_dt_accessor_ambiguous_freq_conversions(self): expected = Series(exp_values, name="xxx") tm.assert_series_equal(ser, expected) - def test_dt_accessor_not_writeable(self, using_copy_on_write): + def test_dt_accessor_not_writeable(self, using_copy_on_write, warn_copy_on_write): # no setting allowed ser = Series(date_range("20130101", periods=5, freq="D"), name="xxx") with pytest.raises(ValueError, match="modifications"): @@ -293,6 +293,10 @@ def test_dt_accessor_not_writeable(self, using_copy_on_write): if using_copy_on_write: with tm.raises_chained_assignment_error(): ser.dt.hour[0] = 5 + elif warn_copy_on_write: + # TODO(CoW-warn) should warn + with tm.assert_cow_warning(False): + ser.dt.hour[0] = 5 else: with pytest.raises(SettingWithCopyError, match=msg): ser.dt.hour[0] = 5 diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index faa3978038dd5..edd3062f7d472 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -107,6 +107,20 @@ def test_astype_dict_like(self, dtype_class): class TestAstype: + def test_astype_object_to_dt64_non_nano(self): + # GH#55756 + ts = Timestamp("2999-01-01") + dtype = "M8[us]" + # NB: the 2500 is interpreted as nanoseconds and rounded *down* + # to 2 microseconds + vals = [ts, "2999-01-02 03:04:05.678910", 2500] + ser = Series(vals, dtype=object) + result = ser.astype(dtype) + + exp_arr = np.array([ts.asm8, vals[1], 2], dtype=dtype) + expected = Series(exp_arr, dtype=dtype) + tm.assert_series_equal(result, expected) + def test_astype_mixed_object_to_dt64tz(self): # pre-2.0 this raised ValueError bc of tz mismatch # xref GH#32581 diff --git a/pandas/tests/series/methods/test_copy.py b/pandas/tests/series/methods/test_copy.py index 77600e0e7d293..ea439fb5a3263 100644 --- a/pandas/tests/series/methods/test_copy.py +++ b/pandas/tests/series/methods/test_copy.py @@ -38,6 +38,7 @@ def test_copy(self, deep, using_copy_on_write): assert np.isnan(ser2[0]) assert np.isnan(ser[0]) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") @pytest.mark.parametrize("deep", ["default", None, False, True]) def test_copy_tzaware(self, deep, using_copy_on_write): # GH#11794 diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 46bc14da59eb0..5d0ef893d5723 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -248,7 +248,7 @@ def test_timedelta_fillna(self, frame_or_series): ] ) td = ser.diff() - obj = frame_or_series(td) + obj = frame_or_series(td).copy() # reg fillna result = obj.fillna(Timedelta(seconds=0)) @@ -321,7 +321,7 @@ def test_timedelta_fillna(self, frame_or_series): # ffill td[2] = np.nan - obj = frame_or_series(td) + obj = frame_or_series(td).copy() result = obj.ffill() expected = td.fillna(Timedelta(seconds=0)) expected[0] = np.nan diff --git a/pandas/tests/io/formats/test_series_info.py b/pandas/tests/series/methods/test_info.py similarity index 100% rename from pandas/tests/io/formats/test_series_info.py rename to pandas/tests/series/methods/test_info.py diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 4ea2fbf51afc9..f94f67b8cc40a 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -7,6 +7,7 @@ date_range, ) import pandas._testing as tm +from pandas.core import algorithms from pandas.core.arrays import PeriodArray @@ -197,13 +198,16 @@ def test_isin_masked_types(self, dtype, data, values, expected): tm.assert_series_equal(result, expected) -@pytest.mark.slow -def test_isin_large_series_mixed_dtypes_and_nan(): +def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch): # https://github.com/pandas-dev/pandas/issues/37094 - # combination of object dtype for the values and > 1_000_000 elements - ser = Series([1, 2, np.nan] * 1_000_000) - result = ser.isin({"foo", "bar"}) - expected = Series([False] * 3 * 1_000_000) + # combination of object dtype for the values + # and > _MINIMUM_COMP_ARR_LEN elements + min_isin_comp = 5 + ser = Series([1, 2, np.nan] * min_isin_comp) + with monkeypatch.context() as m: + m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp) + result = ser.isin({"foo", "bar"}) + expected = Series([False] * 3 * min_isin_comp) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 93c4fbb7f3c46..83adff08b758e 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -162,12 +162,13 @@ def test_rename_error_arg(self): with pytest.raises(KeyError, match=match): ser.rename({2: 9}, errors="raise") - def test_rename_copy_false(self, using_copy_on_write): + def test_rename_copy_false(self, using_copy_on_write, warn_copy_on_write): # GH 46889 ser = Series(["foo", "bar"]) ser_orig = ser.copy() shallow_copy = ser.rename({1: 9}, copy=False) - ser[0] = "foobar" + with tm.assert_cow_warning(warn_copy_on_write): + ser[0] = "foobar" if using_copy_on_write: assert ser_orig[0] == shallow_copy[0] assert ser_orig[1] == shallow_copy[9] diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 75eabe4ea0308..8d9b46508a25f 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -887,12 +887,14 @@ def test_constructor_invalid_coerce_ints_with_float_nan(self, any_int_numpy_dtyp with pytest.raises(IntCastingNaNError, match=msg): Series(np.array(vals), dtype=any_int_numpy_dtype) - def test_constructor_dtype_no_cast(self, using_copy_on_write): + def test_constructor_dtype_no_cast(self, using_copy_on_write, warn_copy_on_write): # see gh-1572 s = Series([1, 2, 3]) s2 = Series(s, dtype=np.int64) - s2[1] = 5 + warn = FutureWarning if warn_copy_on_write else None + with tm.assert_produces_warning(warn): + s2[1] = 5 if using_copy_on_write: assert s[1] == 2 else: diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 0615bd37f10e9..0cc2a125c1e26 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -230,11 +230,11 @@ def mark_array_manager_not_yet_implemented(request) -> None: ) skip_copy_on_write_not_yet_implemented = pytest.mark.xfail( - get_option("mode.copy_on_write"), + get_option("mode.copy_on_write") is True, reason="Not yet implemented/adapted for Copy-on-Write mode", ) skip_copy_on_write_invalid_test = pytest.mark.skipif( - get_option("mode.copy_on_write"), + get_option("mode.copy_on_write") is True, reason="Test not valid for Copy-on-Write mode", )