From a9ba5de85795eb11d981342ab9b19f50e5f9cca7 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 6 Feb 2023 15:44:51 +0100 Subject: [PATCH 001/115] `period_format` now has a faster default formatter leveraging python string formatting --- pandas/_libs/tslibs/period.pyx | 113 +++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index be6f87791284e..63bdc763d1a5f 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1148,46 +1148,89 @@ cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1: cdef str period_format(int64_t value, int freq, object fmt=None): - cdef: - int freq_group - if value == NPY_NAT: return "NaT" - if isinstance(fmt, str): - # Encode using current locale, in case fmt contains non-utf8 chars - fmt = util.string_encode_locale(fmt) - if fmt is None: - freq_group = get_freq_group(freq) - if freq_group == FR_ANN: - fmt = b'%Y' - elif freq_group == FR_QTR: - fmt = b'%FQ%q' - elif freq_group == FR_MTH: - fmt = b'%Y-%m' - elif freq_group == FR_WK: - left = period_asfreq(value, freq, FR_DAY, 0) - right = period_asfreq(value, freq, FR_DAY, 1) - return f"{period_format(left, FR_DAY)}/{period_format(right, FR_DAY)}" - elif freq_group == FR_BUS or freq_group == FR_DAY: - fmt = b'%Y-%m-%d' - elif freq_group == FR_HR: - fmt = b'%Y-%m-%d %H:00' - elif freq_group == FR_MIN: - fmt = b'%Y-%m-%d %H:%M' - elif freq_group == FR_SEC: - fmt = b'%Y-%m-%d %H:%M:%S' - elif freq_group == FR_MS: - fmt = b'%Y-%m-%d %H:%M:%S.%l' - elif freq_group == FR_US: - fmt = b'%Y-%m-%d %H:%M:%S.%u' - elif freq_group == FR_NS: - fmt = b'%Y-%m-%d %H:%M:%S.%n' - else: - raise ValueError(f"Unknown freq: {freq}") + return _period_default_format(value, freq) + else: + if isinstance(fmt, str): + # Encode using current locale, in case fmt contains non-utf8 chars + fmt = util.string_encode_locale(fmt) + + return _period_strftime(value, freq, fmt) - return _period_strftime(value, freq, fmt) + +cdef str _period_default_format(int64_t value, int freq): + """A faster default formatting function leveraging string formatting.""" + + cdef: + int freq_group, quarter + npy_datetimestruct dts + + # fill dts + get_date_info(value, freq, &dts) + + # get the appropriate format depending on frequency group + freq_group = get_freq_group(freq) + if freq_group == FR_ANN: + # fmt = b'%Y' + return f"{dts.year}" + + elif freq_group == FR_QTR: + # fmt = b'%FQ%q' + # get quarter and modify dts.year to be the fiscal year (?) + quarter = get_yq(value, freq, &dts) + return f"{dts.year}Q{quarter}" + + elif freq_group == FR_MTH: + # fmt = b'%Y-%m' + return f"{dts.year}-{dts.month:02d}" + + elif freq_group == FR_WK: + # special: start_date/end_date. Recurse + left = period_asfreq(value, freq, FR_DAY, 0) + right = period_asfreq(value, freq, FR_DAY, 1) + return f"{period_format(left, FR_DAY)}/{period_format(right, FR_DAY)}" + + elif freq_group == FR_BUS or freq_group == FR_DAY: + # fmt = b'%Y-%m-%d' + return f"{dts.year}-{dts.month:02d}-{dts.day:02d}" + + elif freq_group == FR_HR: + # fmt = b'%Y-%m-%d %H:00' + return f"{dts.year}-{dts.month:02d}-{dts.day:02d} {dts.hour:02d}:00" + + elif freq_group == FR_MIN: + # fmt = b'%Y-%m-%d %H:%M' + return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " + f"{dts.hour:02d}:{dts.min:02d}") + + elif freq_group == FR_SEC: + # fmt = b'%Y-%m-%d %H:%M:%S' + return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " + f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}") + + elif freq_group == FR_MS: + # fmt = b'%Y-%m-%d %H:%M:%S.%l' + return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " + f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}" + f".{(dts.us // 1_000):03d}") + + elif freq_group == FR_US: + # fmt = b'%Y-%m-%d %H:%M:%S.%u' + return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " + f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}" + f".{(dts.us):06d}") + + elif freq_group == FR_NS: + # fmt = b'%Y-%m-%d %H:%M:%S.%n' + return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " + f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}" + f".{((dts.us * 1000) + (dts.ps // 1000)):09d}") + + else: + raise ValueError(f"Unknown freq: {freq}") cdef list extra_fmts = [(b"%q", b"^`AB`^"), From a6c06c9010e86ba13be858b13ebe19233c6b15a4 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 6 Feb 2023 15:55:23 +0100 Subject: [PATCH 002/115] class `_Period`: new method `fast_strftime` --- pandas/_libs/tslibs/period.pyi | 1 + pandas/_libs/tslibs/period.pyx | 55 ++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 946ae1215f1e3..74aa969834ce2 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -82,6 +82,7 @@ class Period(PeriodMixin): def _from_ordinal(cls, ordinal: int, freq) -> Period: ... @classmethod def now(cls, freq: BaseOffset = ...) -> Period: ... + def fast_strftime(self, fmt_str: str, loc_s: object) -> str: ... def strftime(self, fmt: str) -> str: ... def to_timestamp( self, diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 63bdc763d1a5f..381e9cb7f8557 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2357,6 +2357,61 @@ cdef class _Period(PeriodMixin): object_state = None, self.freq, self.ordinal return (Period, object_state) + def fast_strftime(self, fmt_str: str, loc_s: object) -> str: + """A faster alternative to `strftime` using string formatting. + + `fmt_str` and `loc_s` should be created using `convert_strftime_format(fmt)`. + + See also `self.strftime`, that relies on `period_format`. + + Examples + -------- + + >>> from pandas._libs.tslibs import convert_strftime_format + >>> a = Period(freq='Q-JUL', year=2006, quarter=1) + >>> a.strftime('%F-Q%q') + '2006-Q1' + >>> fast_fmt, loc_s = convert_strftime_format('%F-Q%q', target="period") + >>> a.fast_strftime(fast_fmt, loc_s) + '2006-Q1' + """ + freq = self._dtype._dtype_code + value = self.ordinal + + if value == NPY_NAT: + return "NaT" + + cdef: + npy_datetimestruct dts, dts2 + int quarter, y, h + + # Fill dts with all fields + get_date_info(value, freq, &dts) + + # Get the quarter and fiscal year + quarter = get_yq(value, freq, &dts2) + + # Finally use the string template + y = dts.year + h = dts.hour + return fmt_str % { + "year": y, + "shortyear": y % 100, + "month": dts.month, + "day": dts.day, + "hour": h, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": loc_s.pm if (h // 12) else loc_s.am, + "min": dts.min, + "sec": dts.sec, + "ms": dts.us // 1000, + "us": dts.us, + "ns": (dts.us * 1000) + (dts.ps // 1000), + "q": quarter, + "Fyear": dts2.year, + "fyear": dts2.year % 100, + } + def strftime(self, fmt: str) -> str: r""" Returns a formatted string representation of the :class:`Period`. From 34113d4d29d540f768f2d01b33a60c0e42803ce2 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 6 Feb 2023 15:56:51 +0100 Subject: [PATCH 003/115] class `Timestamp`: new method `fast_strftime` --- pandas/_libs/tslibs/timestamps.pyi | 1 + pandas/_libs/tslibs/timestamps.pyx | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index e916d7eb12dbf..cef625dd56d1f 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -103,6 +103,7 @@ class Timestamp(datetime): ) -> datetime: ... @classmethod def fromisoformat(cls: type[_DatetimeT], date_string: str) -> _DatetimeT: ... + def fast_strftime(self, fmt_str: str, loc_s: object) -> str: ... def strftime(self, format: str) -> str: ... def __format__(self, fmt: str) -> str: ... def toordinal(self) -> int: ... diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 3c3bb8496aa6e..4e6cbf6499968 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1432,6 +1432,36 @@ class Timestamp(_Timestamp): tz = maybe_get_tz(tz) return cls(datetime.fromtimestamp(ts, tz)) + def fast_strftime(self, fmt_str: str, loc_s: object) -> str: + """A faster alternative to `strftime` using string formatting. + + `fmt_str` and `loc_s` should be created using `convert_strftime_format(fmt)`. + + See also `self.strftime`, that relies on `datetime.strftime`. + + Examples + -------- + >>> from pandas._libs.tslibs import convert_strftime_format + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') + >>> fmt, loc_s = convert_strftime_format('%Y-%m-%dT%H:%M:%S') + >>> ts.fast_strftime(fmt, loc_s) + '2020-03-14T15:32:52' + """ + y = self.year + h = self.hour + return fmt_str % { + "year": y, + "shortyear": y % 100, + "month": self.month, + "day": self.day, + "hour": h, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": loc_s.pm if (h // 12) else loc_s.am, + "min": self.minute, + "sec": self.second, + "us": self.microsecond, + } + def strftime(self, format): """ Return a formatted string of the Timestamp. From 5dd7ab44850164149156a999a651a8e7352fb81d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 6 Feb 2023 16:00:30 +0100 Subject: [PATCH 004/115] New module in tslibs: `strftime.py`. New function in this module: `convert_strftime_format` (raises `UnsupportedStrFmtDirective`). This function converts a `strftime` date format string into a native python formatting string --- pandas/_libs/tslibs/__init__.py | 6 + pandas/_libs/tslibs/strftime.py | 265 ++++++++++++++++++++++++++++++++ 2 files changed, 271 insertions(+) create mode 100644 pandas/_libs/tslibs/strftime.py diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 42f84619ddbe5..f48ce1b03f058 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -9,6 +9,8 @@ "OutOfBoundsTimedelta", "IncompatibleFrequency", "Period", + "convert_strftime_format", + "UnsupportedStrFmtDirective", "Resolution", "Timedelta", "normalize_i8_timestamps", @@ -67,6 +69,10 @@ IncompatibleFrequency, Period, ) +from pandas._libs.tslibs.strftime import ( + UnsupportedStrFmtDirective, + convert_strftime_format, +) from pandas._libs.tslibs.timedeltas import ( Timedelta, delta_to_nanoseconds, diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py new file mode 100644 index 0000000000000..2bcc69ae05e1c --- /dev/null +++ b/pandas/_libs/tslibs/strftime.py @@ -0,0 +1,265 @@ +"""Strftime-related classes and functions. +""" +from datetime import time +import locale +from typing import ( + Dict, + Tuple, +) + + +class UnsupportedStrFmtDirective(ValueError): + """The format contains a directive that is not supported in this context.""" + + +_COMMON_UNSUPPORTED = ( + # 1- Names not in the numpy or datetime attr representation + "%a", # Weekday as locale’s abbreviated name. + "%A", # Weekday as locale’s full name. + "%w", # Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. + "%b", # Month as locale’s abbreviated name. + "%B", # Month as locale’s full name. + # 2- TODO Below Time offset and timezone information ... but may be hard + "%z", # UTC offset in the form ±HHMM[SS[.ffffff]] ("" if tz naive). + "%Z", # Time zone name ("" if tz naive). + # 3- Probably too complex ones for now + "%j", # Day of the year as a zero-padded decimal number. + "%U", # Week number of the year (Sunday as the first day of the week) as + # a zero-padded decimal number. All days in a new year preceding the first + # Sunday are considered to be in week 0. + "%W", # Week number of the year (Monday as the first day of the week) as + # a zero-padded decimal number. All days in a new year preceding the first + # Monday are considered to be in week 0. + "%c", # Locale’s appropriate date and time representation. + "%x", # Locale’s appropriate date representation. + "%X", # Locale’s appropriate time representation. +) + + +_COMMON_MAP = { + "%d": ("day", "02d"), # Day of the month as a zero-padded decimal number. + "%m": ("month", "02d"), # Month as a zero-padded decimal number. + "%Y": ("year", "d"), # Year with century as a decimal number. + "%y": ("shortyear", "02d"), # Year without century as 0-padded decimal nb. + "%H": ("hour", "02d"), # Hour (24-hour clock) as 0-padded decimal number. + "%I": ("hour12", "02d"), # Hour (12-hour clock) as a 0-padded decimal nb. + "%p": ("ampm", "s"), # Locale’s equivalent of either AM or PM. + "%M": ("min", "02d"), # Minute as a zero-padded decimal number. + "%S": ("sec", "02d"), # Second as a zero-padded decimal number. +} + +_DATETIME_MAP = { + "%f": ("us", "06d"), # Microsecond as decimal number, 0-padded to 6 digits +} + +_PERIOD_MAP = { + "%f": ( + "fyear", + "02d", + ), # 'Fiscal' year without century as zero-padded decimal number [00,99] + "%F": ("Fyear", "d"), # 'Fiscal' year with century as a decimal number + "%q": ("q", "d"), # Quarter as a decimal number [1,4] + "%l": ("ms", "03d"), # Millisecond as decimal number, 0-padded 3 digits + "%u": ("us", "06d"), # Microsecond as decimal number, 0-padded 6 digits + "%n": ("ns", "09d"), # Nanosecond as decimal number, 0-padded 9 digits +} + + +class LocaleSpecificDtStrings: + """A container for date/time strings used in a specific locale. + + We will use these when formatting datetime as string using string templates, which + is faster than strftime when executed on arrays. + + `get_current_locale_specific_string()` is the recommended way to get an instance, + as it provides caching. + + Attributes + ---------- + am : str + Used in the %p strftime directive. Locale’s equivalent of AM. + pm : str + Used in the %p strftime directive. Locale’s equivalent of PM. + """ + + __slots__ = ("am", "pm") + + def __init__(self, am: str, pm: str): + self.am = am + self.pm = pm + + def __repr__(self): + attrs = ", ".join( + [f"{k}={repr(getattr(self, k))}" for k in type(self).__slots__] + ) + return f"{type(self).__name__}({attrs})" + + @classmethod + def get_current(cls): + return LocaleSpecificDtStrings( + am=time(1).strftime("%p"), + pm=time(13).strftime("%p"), + ) + + +_locale_specifics: Dict[str, LocaleSpecificDtStrings] = {} + + +def get_current_locale_specific_string() -> LocaleSpecificDtStrings: + """Return a `LocaleSpecificDtStrings` for the current locale. + + This function caches results in the `_locale_specifics` dict. + """ + global _locale_specifics + + # Get current locale + current_locale = locale.setlocale(locale.LC_ALL) + + try: + # Any entry in cache for current locale ? + return _locale_specifics[current_locale] + except KeyError: + # Create it using current locale, and cache it + o = LocaleSpecificDtStrings.get_current() + _locale_specifics[current_locale] = o + return o + + +def convert_strftime_format( + strftime_fmt: str, + target: str = "datetime", + new_style_fmt: bool = False, +) -> Tuple[str, LocaleSpecificDtStrings]: + """Convert a strftime formatting string into a formatting template string. + + The set of supported directives varies according to the `target`. + + This method can be tested on a single instance of + + - `datetime` or `Timestamp`, through + `pandas.core.tools.datetimes.fast_strftime`. The + result may be compared with `datetime.strftime` or `Timestamp.strftime` + + - `Period` through `Period.fast_strftime`. The result may be compared + with `Period.strftime`. + + On array-like objects, this method is used in several places: + + - Subclasses of `DatelikeOps` now rely on this method in their + `self.strftime(fmt, fast_strftime=True)` default implementation, which + delegates to `_format_native_types`. + + - `DatetimeArray._format_native_types` relies on + `tslib.format_array_from_datetime` which relies on this function + - `PeriodArray._format_native_types` directly relies on this function. + - `TimedeltaArray._format_native_types` does not currently support + custom formats. + + In addition, `Datetime64Formatter` and `Datetime64TZFormatter` also + rely on this when their attribute `fast_strftime` is `True` (default). + + Parameters + ---------- + strftime_fmt : str + The strftime format string specification, e.g. `"%Y-%m-%d %H:%M:%S"`. + Note that not all directives are eligible to successful usage of string + formatting. Unsupported directives will lead to an + `UnsupportedStrFmtDirective` being raised. + target : { "datetime", "date", "time", "period" }, default: "datetime" + The kind of data that will be formatted using this template. + new_style_fmt : bool, default: False + Whether the output string should be new-style + e.g. "{year}-{month:02d}-{day:02d} {hour:02d}:{min:02d}:{sec:02d}" + or old-style + e.g. "%(year)s-%(month)02d-%(day)02d %(hour)02d:%(min)02d:%(sec)02d" + + Returns + ------- + fmt_out : str + A string that may be used to format a `datetime` variable. The style of + this string is either old-style or new-style depending on + `new_style_formatting`. + For old-style, it may be used as `fmt_out % fmt_dct`. + For new-style, it may be used as `fmt_out.format(**fmt_dct)` + loc_s : LocaleSpecificDtStrings + An object containing the locale-specific strings needed for some of the + directives. For example loc_s.am and loc_s.pm should be used to fill the "ampm" + part of the template, induced by directive %p. + + Raises + ------ + UnsupportedStrFmtDirective + Raised when the received `strftime_fmt` format contains a directive for + which the output can not currently be created using string formatting. + + See Also + -------- + `strftime format codes reference `_ # noqa + + `Stackoverflow post `_ + explaining how old-style formatting is faster than new-style formatting, + itself faster than datetime.strftime`. + + See `Period.strftime` doc for all the supported period directives (same + directives as the :func:`time.strftime` function of the standard Python + distribution, as well as specific additional directives ``%f``, ``%F``, + ``%q``, ``%l``, ``%u``, ``%n``). + """ + if target in ("datetime", "date", "time"): + directive_maps = (_COMMON_MAP, _DATETIME_MAP) + elif target == "period": + directive_maps = (_COMMON_MAP, _PERIOD_MAP) + else: + raise ValueError(f"Invalid target: {repr(target)}") + + # Raise if unsupported directive found in `strftime_fmt` + for key in _COMMON_UNSUPPORTED: + if key in strftime_fmt: + raise UnsupportedStrFmtDirective(f"Unsupported directive: '{key}'") + + # Mapping between strftime and string formatting, according to both styles + if new_style_fmt: + esc = "/_+\\" + + # Escape the %% before searching for directives, same as strftime + strftime_fmt = strftime_fmt.replace("%%", esc) + + esc_l = "+^_\\" + esc_r = "/_^+" + + # Create the output by replacing all directives + for _map in directive_maps: + for key, (_name, _fmt) in _map.items(): + # for example replace "%d" by "{day:02d}" but with escaped { } + strftime_fmt = strftime_fmt.replace( + key, f"{esc_l}{_name}:{_fmt}{esc_r}" + ) + + # Restore the %% into % + strftime_fmt = strftime_fmt.replace(esc, "%") + + # Escape remaining curly braces + strftime_fmt = strftime_fmt.replace("{", "{{").replace("}", "}}") + + # Finally replace our placeholders + strftime_fmt = strftime_fmt.replace(esc_l, "{").replace(esc_r, "}") + + else: + esc = "/_^+" + + # Escape the %% before searching for directives, same as strftime + strftime_fmt = strftime_fmt.replace("%%", esc * 2) + + # Create the output by replacing all directives + for _map in directive_maps: + for key, (_name, _fmt) in _map.items(): + # for example replace "%d" by "%(day)02d" but with escaped % + strftime_fmt = strftime_fmt.replace(key, f"{esc}({_name}){_fmt}") + + # Escape remaining percent signs + strftime_fmt = strftime_fmt.replace("%", "%%") + + # Finally replace our placeholder + strftime_fmt = strftime_fmt.replace(esc, "%") + + return strftime_fmt, get_current_locale_specific_string() From 8a7c0395c0ded3276e9a923851c46fb4f879a364 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 6 Feb 2023 16:04:40 +0100 Subject: [PATCH 005/115] `format_array_from_datetime`: new boolean argument `fast_strftime` to use faster datetime formatting. --- pandas/_libs/tslib.pyi | 1 + pandas/_libs/tslib.pyx | 48 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 8fec9ecf27f30..8f77286fb0f92 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -10,6 +10,7 @@ def format_array_from_datetime( format: str | None = ..., na_rep: object = ..., reso: int = ..., # NPY_DATETIMEUNIT + fast_strftime: bool = ..., ) -> npt.NDArray[np.object_]: ... def array_with_unit_to_datetime( values: np.ndarray, diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 03331f54db892..0deb77026006b 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -68,6 +68,10 @@ from pandas._libs.tslibs import ( Resolution, get_resolution, ) +from pandas._libs.tslibs.strftime import ( + UnsupportedStrFmtDirective, + convert_strftime_format, +) from pandas._libs.tslibs.timestamps import Timestamp # Note: this is the only non-tslibs intra-pandas dependency here @@ -112,6 +116,7 @@ def format_array_from_datetime( str format=None, object na_rep=None, NPY_DATETIMEUNIT reso=NPY_FR_ns, + fast_strftime=True, ) -> np.ndarray: """ return a np object array of the string formatted values @@ -125,18 +130,22 @@ def format_array_from_datetime( na_rep : optional, default is None a nat format reso : NPY_DATETIMEUNIT, default NPY_FR_ns + fast_strftime : bool, default True + If `True` (default) and the format permits it, a faster formatting + method will be used. See `convert_strftime_format`. Returns ------- np.ndarray[object] """ cdef: - int64_t val, ns, N = values.size + int64_t val, ns, y, h, N = values.size bint show_ms = False, show_us = False, show_ns = False bint basic_format = False, basic_format_day = False _Timestamp ts object res npy_datetimestruct dts + object str_format, loc_s # Note that `result` (and thus `result_flat`) is C-order and # `it` iterates C-order as well, so the iteration matches @@ -173,6 +182,18 @@ def format_array_from_datetime( # Default format for dates basic_format_day = True + elif fast_strftime: + if format is None: + # We'll fallback to the Timestamp.str method + fast_strftime = False + else: + try: + # Try to get the string formatting template for this format + str_format, loc_s = convert_strftime_format(format) + except UnsupportedStrFmtDirective: + # Unsupported directive: fallback to standard `strftime` + fast_strftime = False + assert not (basic_format_day and basic_format) for i in range(N): @@ -200,6 +221,31 @@ def format_array_from_datetime( elif show_ms: res += f'.{dts.us // 1000:03d}' + elif fast_strftime: + + if tz is None: + pandas_datetime_to_datetimestruct(val, reso, &dts) + + # Use string formatting for faster strftime + y = dts.year + h = dts.hour + result[i] = str_format % { + "year": y, + "shortyear": y % 100, + "month": dts.month, + "day": dts.day, + "hour": dts.hour, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": loc_s.pm if (h // 12) else loc_s.am, + "min": dts.min, + "sec": dts.sec, + "us": dts.us, + } + else: + ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz) + + # Use string formatting for faster strftime + res = ts.fast_strftime(str_format, loc_s) else: ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz) From f2d2fb1ef03b609882d5afb86411a0d7099ab744 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 6 Feb 2023 16:12:04 +0100 Subject: [PATCH 006/115] datetimelike `strftime`: new boolean argument `fast_strftime` to use faster datetime formatting. `_format_native_types` modified with this new argument too. Subclasses modified to support it (`DatetimeArray`, `PeriodArray`, `TimedeltaArray`, `DatetimeIndex`) --- pandas/core/arrays/datetimelike.py | 16 +++++++++++++--- pandas/core/arrays/datetimes.py | 9 +++++++-- pandas/core/arrays/period.py | 24 ++++++++++++++++++++++-- pandas/core/arrays/timedeltas.py | 4 ++-- pandas/core/indexes/datetimes.py | 4 ++-- 5 files changed, 46 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index bcf4b5d58bf74..ccf3d8225b9b8 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -309,7 +309,7 @@ def asi8(self) -> npt.NDArray[np.int64]: # Rendering Methods def _format_native_types( - self, *, na_rep: str | float = "NaT", date_format=None + self, *, na_rep: str | float = "NaT", date_format=None, fast_strftime=True ) -> npt.NDArray[np.object_]: """ Helper method for astype when converting to strings. @@ -1759,7 +1759,9 @@ class DatelikeOps(DatetimeLikeArrayMixin): URL="https://docs.python.org/3/library/datetime.html" "#strftime-and-strptime-behavior" ) - def strftime(self, date_format: str) -> npt.NDArray[np.object_]: + def strftime( + self, date_format: str, fast_strftime: bool = True + ) -> npt.NDArray[np.object_]: """ Convert to Index using specified date_format. @@ -1781,6 +1783,12 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: date_format : str Date format string (e.g. "%%Y-%%m-%%d"). + fast_strftime : bool, default True + If `True` (default) and the format permits it, a faster formatting + method will be used. See `convert_strftime_format`. + + .. versionadded:: 1.5.4 + Returns ------- ndarray[object] @@ -1804,7 +1812,9 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: 'March 10, 2018, 09:00:02 AM'], dtype='object') """ - result = self._format_native_types(date_format=date_format, na_rep=np.nan) + result = self._format_native_types( + date_format=date_format, na_rep=np.nan, fast_strftime=fast_strftime + ) return result.astype(object, copy=False) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8395d54224f1d..6d943059c2929 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -703,14 +703,19 @@ def astype(self, dtype, copy: bool = True): # Rendering Methods def _format_native_types( - self, *, na_rep: str | float = "NaT", date_format=None, **kwargs + self, *, na_rep: str | float = "NaT", date_format=None, fast_strftime=True, **kwargs ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_datetime64_from_values fmt = get_format_datetime64_from_values(self, date_format) return tslib.format_array_from_datetime( - self.asi8, tz=self.tz, format=fmt, na_rep=na_rep, reso=self._creso + self.asi8, + tz=self.tz, + format=fmt, + na_rep=na_rep, + reso=self._creso, + fast_strftime=fast_strftime, ) # ----------------------------------------------------------------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 5e1b0c4b18718..ee6ce99c4ef37 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -24,7 +24,9 @@ NaT, NaTType, Timedelta, + UnsupportedStrFmtDirective, astype_overflowsafe, + convert_strftime_format, dt64arr_to_periodarr as c_dt64arr_to_periodarr, get_unit_from_dtype, iNaT, @@ -642,16 +644,34 @@ def _formatter(self, boxed: bool = False): @dtl.ravel_compat def _format_native_types( - self, *, na_rep: str | float = "NaT", date_format=None, **kwargs + self, *, na_rep: str | float = "NaT", date_format=None, fast_strftime=True, **kwargs ) -> npt.NDArray[np.object_]: """ actually format my specific types + + TODO maybe rather align with the way it is done in datetimes.py ? + (delegate all to a tslib.format_array_from_period cython numpy method) """ values = self.astype(object) # Create the formatter function if date_format: - formatter = lambda per: per.strftime(date_format) + if fast_strftime: + try: + # Try to get the string formatting template for this format + str_format, loc_s = convert_strftime_format( + date_format, target="period" + ) + except UnsupportedStrFmtDirective: + # Unsupported directive: fallback to standard `strftime` + fast_strftime = False + + if fast_strftime: + # Faster: python string formatting + formatter = lambda per: per.fast_strftime(str_format, loc_s) + else: + # Slower: strftime + formatter = lambda per: per.strftime(date_format) else: # Uses `_Period.str` which in turn uses `format_period` formatter = lambda per: str(per) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 92b9222cfc9bc..f2eebc4427153 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -399,11 +399,11 @@ def _formatter(self, boxed: bool = False): return get_format_timedelta64(self, box=True) def _format_native_types( - self, *, na_rep: str | float = "NaT", date_format=None, **kwargs + self, *, na_rep: str | float = "NaT", date_format=None, fast_strftime=True, **kwargs ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_timedelta64 - # Relies on TimeDelta._repr_base + # Relies on TimeDelta._repr_base (and does use the `date_format` arg) formatter = get_format_timedelta64(self._ndarray, na_rep) # equiv: np.array([formatter(x) for x in self._ndarray]) # but independent of dimension diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 667deec23757f..4341b53b5ae6b 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -266,8 +266,8 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]: # methods that dispatch to DatetimeArray and wrap result @doc(DatetimeArray.strftime) - def strftime(self, date_format) -> Index: - arr = self._data.strftime(date_format) + def strftime(self, date_format, fast_strftime: bool = True) -> Index: + arr = self._data.strftime(date_format, fast_strftime=fast_strftime) return Index(arr, name=self.name, dtype=object) @doc(DatetimeArray.tz_convert) From fac90d704483fd06da6b4059950217800c5782e6 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 6 Feb 2023 16:13:25 +0100 Subject: [PATCH 007/115] `DatetimeIndexOpsMixin.format` and `_format_with_header`: new boolean argument `fast_strftime` to use faster datetime formatting. --- pandas/core/indexes/datetimelike.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index c6c8695ab01da..8b38dbfaad9cf 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -162,6 +162,7 @@ def format( formatter: Callable | None = None, na_rep: str = "NaT", date_format: str | None = None, + fast_strftime: bool = True, ) -> list[str]: """ Render a string representation of the Index. @@ -177,14 +178,22 @@ def format( if formatter is not None: return header + list(self.map(formatter)) - return self._format_with_header(header, na_rep=na_rep, date_format=date_format) + return self._format_with_header( + header, na_rep=na_rep, date_format=date_format, fast_strftime=fast_strftime + ) def _format_with_header( - self, header: list[str], na_rep: str = "NaT", date_format: str | None = None + self, + header: list[str], + na_rep: str = "NaT", + date_format: str | None = None, + fast_strftime: bool = True, ) -> list[str]: # matches base class except for whitespace padding and date_format return header + list( - self._format_native_types(na_rep=na_rep, date_format=date_format) + self._format_native_types( + na_rep=na_rep, date_format=date_format, fast_strftime=fast_strftime + ) ) @property From 2fc70a6eca885746efb5b2be66d90c9f70026697 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 6 Feb 2023 16:18:08 +0100 Subject: [PATCH 008/115] `NDFrame.to_csv` and `DataFrameRenderer.to_csv` and `CSVFormatter.__init__`: new boolean argument `fast_strftime` to use faster datetime formatting. --- pandas/core/generic.py | 10 ++++++++ pandas/io/formats/csvs.py | 6 ++--- pandas/io/formats/format.py | 50 ++++++++++++++++++++++++++++++++----- 3 files changed, 57 insertions(+), 9 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a1f799ec5122a..267f96244c26e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3497,6 +3497,7 @@ def to_csv( lineterminator: str | None = ..., chunksize: int | None = ..., date_format: str | None = ..., + fast_strftime: bool_t = ..., doublequote: bool_t = ..., escapechar: str | None = ..., decimal: str = ..., @@ -3524,6 +3525,7 @@ def to_csv( lineterminator: str | None = ..., chunksize: int | None = ..., date_format: str | None = ..., + fast_strftime: bool_t = ..., doublequote: bool_t = ..., escapechar: str | None = ..., decimal: str = ..., @@ -3556,6 +3558,7 @@ def to_csv( lineterminator: str | None = None, chunksize: int | None = None, date_format: str | None = None, + fast_strftime: bool_t = True, doublequote: bool_t = True, escapechar: str | None = None, decimal: str = ".", @@ -3648,6 +3651,12 @@ def to_csv( Rows to write at a time. date_format : str, default None Format string for datetime objects. + fast_strftime : bool, default True + If `True` (default) and the format permits it, a faster formatting + method will be used. See `convert_strftime_format`. + + .. versionadded:: 1.5.4 + doublequote : bool, default True Control quoting of `quotechar` inside a field. escapechar : str, default None @@ -3730,6 +3739,7 @@ def to_csv( chunksize=chunksize, quotechar=quotechar, date_format=date_format, + fast_strftime=fast_strftime, doublequote=doublequote, escapechar=escapechar, storage_options=storage_options, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 6ab57b0cce2a4..95a8b17c35c99 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -29,12 +29,9 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.generic import ( - ABCDatetimeIndex, ABCIndex, ABCMultiIndex, - ABCPeriodIndex, ) -from pandas.core.dtypes.missing import notna from pandas.core.indexes.api import Index @@ -63,6 +60,7 @@ def __init__( chunksize: int | None = None, quotechar: str | None = '"', date_format: str | None = None, + fast_strftime: bool = True, doublequote: bool = True, escapechar: str | None = None, storage_options: StorageOptions = None, @@ -86,6 +84,7 @@ def __init__( self.escapechar = escapechar self.lineterminator = lineterminator or os.linesep self.date_format = date_format + self.fast_strftime = fast_strftime self.cols = self._initialize_columns(cols) self.chunksize = self._initialize_chunksize(chunksize) @@ -172,6 +171,7 @@ def _number_format(self) -> dict[str, Any]: "na_rep": self.na_rep, "float_format": self.float_format, "date_format": self.date_format, + "fast_strftime": self.fast_strftime, "quoting": self.quoting, "decimal": self.decimal, } diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index cdc21f04da43a..7867622d72f59 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -44,6 +44,8 @@ NaT, Timedelta, Timestamp, + UnsupportedStrFmtDirective, + convert_strftime_format, get_unit_from_dtype, iNaT, periods_per_day, @@ -1151,6 +1153,7 @@ def to_csv( lineterminator: str | None = None, chunksize: int | None = None, date_format: str | None = None, + fast_strftime: bool = True, doublequote: bool = True, escapechar: str | None = None, errors: str = "strict", @@ -1181,6 +1184,7 @@ def to_csv( chunksize=chunksize, quotechar=quotechar, date_format=date_format, + fast_strftime=fast_strftime, doublequote=doublequote, escapechar=escapechar, storage_options=storage_options, @@ -1623,11 +1627,13 @@ def __init__( values: np.ndarray | Series | DatetimeIndex | DatetimeArray, nat_rep: str = "NaT", date_format: None = None, + fast_strftime: bool = True, **kwargs, ) -> None: super().__init__(values, **kwargs) self.nat_rep = nat_rep self.date_format = date_format + self.fast_strftime = fast_strftime def _format_strings(self) -> list[str]: """we by definition have DO NOT have a TZ""" @@ -1640,7 +1646,9 @@ def _format_strings(self) -> list[str]: return [self.formatter(x) for x in values] fmt_values = values._data._format_native_types( - na_rep=self.nat_rep, date_format=self.date_format + na_rep=self.nat_rep, + date_format=self.date_format, + fast_strftime=self.fast_strftime, ) return fmt_values.tolist() @@ -1782,28 +1790,53 @@ def _format_datetime64_dateonly( x: NaTType | Timestamp, nat_rep: str = "NaT", date_format: str | None = None, + str_date_fmt: str | None = None, + loc_s: object | None = None, ) -> str: if isinstance(x, NaTType): return nat_rep if date_format: - return x.strftime(date_format) + if str_date_fmt: + # Faster, using string formatting + return x.fast_strftime(str_date_fmt, loc_s) + else: + # Slower + return x.strftime(date_format) else: # Timestamp._date_repr relies on string formatting (faster than strftime) return x._date_repr def get_format_datetime64( - is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None + is_dates_only: bool, + nat_rep: str = "NaT", + date_format: str | None = None, + fast_strftime: bool = True, ) -> Callable: """Return a formatter callable taking a datetime64 as input and providing a string as output""" if is_dates_only: + str_date_fmt = loc_s = None + if date_format is not None and fast_strftime: + try: + # Try to get the string formatting template for this format + str_date_fmt, loc_s = convert_strftime_format(date_format) + except UnsupportedStrFmtDirective: + # Unsupported directive: fallback to standard `strftime` + pass + return lambda x: _format_datetime64_dateonly( - x, nat_rep=nat_rep, date_format=date_format + x, + nat_rep=nat_rep, + date_format=date_format, + str_date_fmt=str_date_fmt, + loc_s=loc_s, ) else: + # Relies on datetime.str, which is fast already + # TODO why is date_format not used in this case ? This seems like a bug? return lambda x: _format_datetime64(x, nat_rep=nat_rep) @@ -1827,9 +1860,13 @@ class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self) -> list[str]: """we by definition have a TZ""" values = self.values.astype(object) - ido = is_dates_only(values) + # When there is a timezone `is_dates_only` always returns `False` since dates + # are not universal dates but have 00:00:00 timestamps in the given timezone. + assert not is_dates_only(values) formatter = self.formatter or get_format_datetime64( - ido, date_format=self.date_format + is_dates_only=False, + date_format=self.date_format, + fast_strftime=self.fast_strftime, ) fmt_values = [formatter(x) for x in values] @@ -1849,6 +1886,7 @@ def __init__( self.box = box def _format_strings(self) -> list[str]: + # Note: `get_format_timedelta64` uses fast formatting formatter = self.formatter or get_format_timedelta64( self.values, nat_rep=self.nat_rep, box=self.box ) From 6edda539d597f90db531138ec550259c9643e48d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 7 Feb 2023 14:04:33 +0100 Subject: [PATCH 009/115] Added tests for the `to_csv` dataframe method to cover the new fast_strftime feature --- pandas/tests/frame/methods/test_to_csv.py | 22 +++++--- pandas/tests/io/formats/test_to_csv.py | 63 +++++++++++++++++++++++ 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 1933278efb443..b8d227c9c43f5 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1005,13 +1005,16 @@ def test_to_csv_compression(self, df, encoding, compression): with tm.decompress_file(filename, compression) as fh: tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) - def test_to_csv_date_format(self, datetime_frame): + @pytest.mark.parametrize("fast_strftime", (True, False)) + def test_to_csv_date_format(self, datetime_frame, fast_strftime): with tm.ensure_clean("__tmp_to_csv_date_format__") as path: dt_index = datetime_frame.index datetime_frame = DataFrame( {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index ) - datetime_frame.to_csv(path, date_format="%Y%m%d") + datetime_frame.to_csv( + path, date_format="%Y%m%d", fast_strftime=fast_strftime + ) # Check that the data was put in the specified format test = read_csv(path, index_col=0) @@ -1025,7 +1028,9 @@ def test_to_csv_date_format(self, datetime_frame): tm.assert_frame_equal(test, datetime_frame_int) - datetime_frame.to_csv(path, date_format="%Y-%m-%d") + datetime_frame.to_csv( + path, date_format="%Y-%m-%d", fast_strftime=fast_strftime + ) # Check that the data was put in the specified format test = read_csv(path, index_col=0) @@ -1040,7 +1045,9 @@ def test_to_csv_date_format(self, datetime_frame): # Check that columns get converted datetime_frame_columns = datetime_frame.T - datetime_frame_columns.to_csv(path, date_format="%Y%m%d") + datetime_frame_columns.to_csv( + path, date_format="%Y%m%d", fast_strftime=fast_strftime + ) test = read_csv(path, index_col=0) @@ -1059,14 +1066,15 @@ def test_to_csv_date_format(self, datetime_frame): ["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"] ) nat_frame = DataFrame({"A": nat_index}, index=nat_index) - nat_frame.to_csv(path, date_format="%Y-%m-%d") + nat_frame.to_csv(path, date_format="%Y-%m-%d", fast_strftime=fast_strftime) test = read_csv(path, parse_dates=[0, 1], index_col=0) tm.assert_frame_equal(test, nat_frame) + @pytest.mark.parametrize("fast_strftime", (True, False)) @pytest.mark.parametrize("td", [pd.Timedelta(0), pd.Timedelta("10s")]) - def test_to_csv_with_dst_transitions(self, td): + def test_to_csv_with_dst_transitions(self, td, fast_strftime): with tm.ensure_clean("csv_date_format_with_dst") as path: # make sure we are not failing on transitions @@ -1081,7 +1089,7 @@ def test_to_csv_with_dst_transitions(self, td): i = i._with_freq(None) # freq is not preserved by read_csv time_range = np.array(range(len(i)), dtype="int64") df = DataFrame({"A": time_range}, index=i) - df.to_csv(path, index=True) + df.to_csv(path, index=True, fast_strftime=fast_strftime) # we have to reconvert the index as we # don't parse the tz's result = read_csv(path, index_col=0) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 2b86e9c7b3de2..9889b6b19c640 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -282,6 +282,69 @@ def test_to_csv_date_format(self): df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + def test_to_csv_datetime_format_index(self): + """Test that formatting also works for datetime index""" + df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) + df_sec = df_sec.set_index("A") + + # default date_format + res = df_sec.to_csv() + expected_rows = [ + "A", + "2013-01-01 00:00:00", + "2013-01-01 00:00:01", + "2013-01-01 00:00:02", + "2013-01-01 00:00:03", + "2013-01-01 00:00:04", + ] + expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_default_sec + + # custom date_format + res = df_sec.to_csv(date_format="%Y-%m-%d %H:%M:%S.%f") + expected_rows = [ + "A", + "2013-01-01 00:00:00.000000", + "2013-01-01 00:00:01.000000", + "2013-01-01 00:00:02.000000", + "2013-01-01 00:00:03.000000", + "2013-01-01 00:00:04.000000", + ] + expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_default_sec + + def test_to_csv_period_format_index(self): + """Test that formatting also works for period index""" + # same for periods + df_month = DataFrame({"A": pd.period_range("20130101", periods=5, freq="m")}) + df_month = df_month.set_index("A") + + # default date_format + res = df_month.to_csv() + expected_rows = [ + "A", + "2013-01", + "2013-02", + "2013-03", + "2013-04", + "2013-05", + ] + expected_default_mon = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_default_mon + + # custom format + res = df_month.to_csv(date_format="%F : %q") + expected_rows = [ + "A", + "2013 : 1", + "2013 : 1", + "2013 : 1", + "2013 : 2", + "2013 : 2", + ] + expected_ymdhms_month = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_ymdhms_month + def test_to_csv_different_datetime_formats(self): # GH#21734 df = DataFrame( From b4c815d3b759638dec741e75c61e3b58ad8aa549 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 7 Feb 2023 14:06:22 +0100 Subject: [PATCH 010/115] `TestCategoricalRepr`: added a test for dates without time, with timezone --- pandas/tests/arrays/categorical/test_repr.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index b44af07cee01d..5c626f3321805 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -207,6 +207,22 @@ def test_categorical_repr_datetime(self): assert repr(c) == exp + # same with dates only: since there is a timezone, dates become datetimes + idx = date_range("2011-01-01", freq="D", periods=5, tz="US/Eastern") + c = Categorical(idx) + exp = ( + "[2011-01-01 00:00:00-05:00, 2011-01-02 00:00:00-05:00, " + "2011-01-03 00:00:00-05:00, 2011-01-04 00:00:00-05:00, " + "2011-01-05 00:00:00-05:00]\n" + "Categories (5, datetime64[ns, US/Eastern]): " + "[2011-01-01 00:00:00-05:00, 2011-01-02 00:00:00-05:00,\n" + " " + "2011-01-03 00:00:00-05:00, 2011-01-04 00:00:00-05:00,\n" + " " + "2011-01-05 00:00:00-05:00]" + ) + assert repr(c) == exp + def test_categorical_repr_datetime_ordered(self): idx = date_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx, ordered=True) From eaa1dc9b9c056cd25ffe489ae58fd8440dbece01 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 7 Feb 2023 14:07:08 +0100 Subject: [PATCH 011/115] Fixed `test_nat` and `test_api` with the new symbols added --- pandas/tests/scalar/test_nat.py | 11 ++++++++++- pandas/tests/tslibs/test_api.py | 3 +++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 1fd5f5ab7c2a6..0a698d6c4d4fd 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -190,7 +190,16 @@ def test_nat_iso_format(get_nat): @pytest.mark.parametrize( "klass,expected", [ - (Timestamp, ["freqstr", "normalize", "to_julian_date", "to_period"]), + ( + Timestamp, + [ + "fast_strftime", + "freqstr", + "normalize", + "to_julian_date", + "to_period", + ], + ), ( Timedelta, [ diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 264662a7e93cd..76009b78ce076 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -17,6 +17,7 @@ def test_namespace(): "parsing", "period", "strptime", + "strftime", "vectorized", "timedeltas", "timestamps", @@ -33,6 +34,8 @@ def test_namespace(): "OutOfBoundsDatetime", "OutOfBoundsTimedelta", "Period", + "convert_strftime_format", + "UnsupportedStrFmtDirective", "IncompatibleFrequency", "Resolution", "Tick", From 3254b54f9b62bfa9b53c282d3ec28384a08b0b59 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 7 Feb 2023 15:28:43 +0100 Subject: [PATCH 012/115] New `test_strftime` module to cover the `strftime.py` module in tslib. --- pandas/tests/tslibs/test_strftime.py | 137 +++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 pandas/tests/tslibs/test_strftime.py diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py new file mode 100644 index 0000000000000..76a286e13eb8f --- /dev/null +++ b/pandas/tests/tslibs/test_strftime.py @@ -0,0 +1,137 @@ +""" +Test datetime formatting low-level routines +""" +from contextlib import nullcontext + +from datetime import time + +import locale +import pytest + +from pandas._libs.tslibs import convert_strftime_format +from pandas._libs.tslibs.strftime import get_current_locale_specific_string, \ + UnsupportedStrFmtDirective +import pandas._testing as tm + + +def get_local_am_pm(): + """Return the AM and PM strings returned by strftime in current locale.""" + am_local = time(1).strftime("%p") + pm_local = time(13).strftime("%p") + return am_local, pm_local + + +@pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], +) +def test_get_current_locale_specific_string(locale_str): + """Test that `get_current_locale_specific_string` relies on runtime locale.""" + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. + with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # Test that the function returns the correct ones + res = get_current_locale_specific_string() + assert res.am == am_local + assert res.pm == pm_local + + +class TestConvertStrftimeFormat: + """Tests for `convert_strftime_format`.""" + + @pytest.mark.parametrize("strftime_fmt,res_fmt_old,res_fmt_new", ( + ("%p", "%(ampm)s", "{ampm:s}"), + ("%m-%d-%Y", "%(month)02d-%(day)02d-%(year)d", "{month:02d}-{day:02d}-{year:d}"), + ("20%y-%m-%d__foo__%I:%M:%S%p", + "20%(shortyear)02d-%(month)02d-%(day)02d__foo__%(hour12)02d:%(min)02d:%(sec)02d%(ampm)s", + "20{shortyear:02d}-{month:02d}-{day:02d}__foo__{hour12:02d}:{min:02d}:{sec:02d}{ampm:s}") + )) + def test_format_datetime(self, strftime_fmt, res_fmt_old, res_fmt_new): + """Test that `convert_strftime_format` returns the correct formatting template""" + str_tmp, loc_s = convert_strftime_format(strftime_fmt, target="datetime", + new_style_fmt=False) + assert str_tmp == res_fmt_old + + str_tmp_new, loc_s2 = convert_strftime_format(strftime_fmt, target="datetime", + new_style_fmt=True) + assert loc_s2 == loc_s + assert str_tmp_new == res_fmt_new + + @pytest.mark.parametrize("strftime_fmt,res_fmt_old,res_fmt_new", ( + ("%p", "%(ampm)s", "{ampm:s}"), + ("%m-%d-%Y", "%(month)02d-%(day)02d-%(year)d", + "{month:02d}-{day:02d}-{year:d}"), + ("%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + "%(shortyear)02d %(hour12)02d:%(min)02d:%(sec)02d%(ampm)s (ms=%(ms)03d us=%(us)06d ns=%(ns)09d)", + "{shortyear:02d} {hour12:02d}:{min:02d}:{sec:02d}{ampm:s} (ms={ms:03d} us={us:06d} ns={ns:09d})"), + ("20%y-%m-%d__foo__%I:%M:%S%p", + "20%(shortyear)02d-%(month)02d-%(day)02d__foo__%(hour12)02d:%(min)02d:%(sec)02d%(ampm)s", + "20{shortyear:02d}-{month:02d}-{day:02d}__foo__{hour12:02d}:{min:02d}:{sec:02d}{ampm:s}") + )) + def test_format_period(self, strftime_fmt, res_fmt_old, res_fmt_new): + """Test that `convert_strftime_format` returns the correct formatting template""" + str_tmp, loc_s = convert_strftime_format(strftime_fmt, target="period", + new_style_fmt=False) + assert str_tmp == res_fmt_old + + str_tmp_new, loc_s2 = convert_strftime_format(strftime_fmt, target="period", + new_style_fmt=True) + assert loc_s2 == loc_s + assert str_tmp_new == res_fmt_new + + @pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], + ) + @pytest.mark.parametrize("target", ("datetime", "date", "time", "period")) + def test_format_non_ascii(self, locale_str, target): + """Test that `convert_strftime_format` is robust to locale and format encoding""" + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. + with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + strftime_fmt = "%y é" + + str_tmp, _ = convert_strftime_format(strftime_fmt, target="datetime", + new_style_fmt=False) + assert str_tmp == "%(shortyear)02d é" + + str_tmp_new, _ = convert_strftime_format(strftime_fmt, target="datetime", + new_style_fmt=True) + assert str_tmp_new == "{shortyear:02d} é" + + def test_invalid_datetime_directive(self): + """Test that using invalid strftime directives for datetime raises an error""" + with pytest.raises(UnsupportedStrFmtDirective): + convert_strftime_format("%F", target="datetime") + + def test_unknown_directive(self): + """Test that unknown directives (non strftime) are simply escaped.""" + res_str, _ = convert_strftime_format("%O", target="datetime") + assert res_str == "%%O" + + res_str, _ = convert_strftime_format("%O", target="datetime", + new_style_fmt=True) + assert res_str == "%O" From 0f6928662aff162c8a5526575dbfbf37cec054b1 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 7 Feb 2023 15:31:02 +0100 Subject: [PATCH 013/115] `convert_strftime_format` argument `target` is now mandatory to avoid mistakes --- pandas/_libs/tslib.pyx | 2 +- pandas/_libs/tslibs/period.pyx | 3 ++- pandas/_libs/tslibs/strftime.py | 2 +- pandas/_libs/tslibs/timestamps.pyx | 5 +++-- pandas/io/formats/format.py | 3 ++- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 0deb77026006b..68ad299676c58 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -189,7 +189,7 @@ def format_array_from_datetime( else: try: # Try to get the string formatting template for this format - str_format, loc_s = convert_strftime_format(format) + str_format, loc_s = convert_strftime_format(format, target="datetime") except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` fast_strftime = False diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 381e9cb7f8557..286104d989eb1 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2360,7 +2360,8 @@ cdef class _Period(PeriodMixin): def fast_strftime(self, fmt_str: str, loc_s: object) -> str: """A faster alternative to `strftime` using string formatting. - `fmt_str` and `loc_s` should be created using `convert_strftime_format(fmt)`. + `fmt_str` and `loc_s` should be created using + `convert_strftime_format(fmt, target="period")`. See also `self.strftime`, that relies on `period_format`. diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 2bcc69ae05e1c..41c9d53ed7ccf 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -127,7 +127,7 @@ def get_current_locale_specific_string() -> LocaleSpecificDtStrings: def convert_strftime_format( strftime_fmt: str, - target: str = "datetime", + target: str, new_style_fmt: bool = False, ) -> Tuple[str, LocaleSpecificDtStrings]: """Convert a strftime formatting string into a formatting template string. diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 4e6cbf6499968..db424c71598e1 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1435,7 +1435,8 @@ class Timestamp(_Timestamp): def fast_strftime(self, fmt_str: str, loc_s: object) -> str: """A faster alternative to `strftime` using string formatting. - `fmt_str` and `loc_s` should be created using `convert_strftime_format(fmt)`. + `fmt_str` and `loc_s` should be created using + `convert_strftime_format(fmt, target="datetime")`. See also `self.strftime`, that relies on `datetime.strftime`. @@ -1443,7 +1444,7 @@ class Timestamp(_Timestamp): -------- >>> from pandas._libs.tslibs import convert_strftime_format >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') - >>> fmt, loc_s = convert_strftime_format('%Y-%m-%dT%H:%M:%S') + >>> fmt, loc_s = convert_strftime_format('%Y-%m-%dT%H:%M:%S', target="datetime") >>> ts.fast_strftime(fmt, loc_s) '2020-03-14T15:32:52' """ diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 7867622d72f59..b2ce062783249 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1822,7 +1822,8 @@ def get_format_datetime64( if date_format is not None and fast_strftime: try: # Try to get the string formatting template for this format - str_date_fmt, loc_s = convert_strftime_format(date_format) + str_date_fmt, loc_s = convert_strftime_format(date_format, + target="datetime") except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` pass From 72fe3791f30c08d127e9f34401f09928b1eca23f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 7 Feb 2023 15:32:00 +0100 Subject: [PATCH 014/115] `convert_strftime_format`: Completed unsupported directives for datetimes --- pandas/_libs/tslibs/strftime.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 41c9d53ed7ccf..0567e9c3c647b 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -51,6 +51,13 @@ class UnsupportedStrFmtDirective(ValueError): _DATETIME_MAP = { "%f": ("us", "06d"), # Microsecond as decimal number, 0-padded to 6 digits } +_DATETIME_UNSUPPORTED = ( + "%F", + "%q", + "%l", + "%u", + "%n", +) _PERIOD_MAP = { "%f": ( @@ -63,6 +70,7 @@ class UnsupportedStrFmtDirective(ValueError): "%u": ("us", "06d"), # Microsecond as decimal number, 0-padded 6 digits "%n": ("ns", "09d"), # Nanosecond as decimal number, 0-padded 9 digits } +_PERIOD_UNSUPPORTED = () class LocaleSpecificDtStrings: @@ -207,15 +215,18 @@ def convert_strftime_format( """ if target in ("datetime", "date", "time"): directive_maps = (_COMMON_MAP, _DATETIME_MAP) + unsupported = (_COMMON_UNSUPPORTED, _DATETIME_UNSUPPORTED) elif target == "period": directive_maps = (_COMMON_MAP, _PERIOD_MAP) + unsupported = (_COMMON_UNSUPPORTED, _PERIOD_UNSUPPORTED) else: raise ValueError(f"Invalid target: {repr(target)}") # Raise if unsupported directive found in `strftime_fmt` - for key in _COMMON_UNSUPPORTED: - if key in strftime_fmt: - raise UnsupportedStrFmtDirective(f"Unsupported directive: '{key}'") + for _u in unsupported: + for key in _u: + if key in strftime_fmt: + raise UnsupportedStrFmtDirective(f"Unsupported directive: '{key}'") # Mapping between strftime and string formatting, according to both styles if new_style_fmt: From eda424306872fabefde437d73d7a412e784f12c8 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 7 Feb 2023 15:32:50 +0100 Subject: [PATCH 015/115] Fixed bug in tslib `format_array_from_datetime` --- pandas/_libs/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 68ad299676c58..ee822ee1aa83c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -229,7 +229,7 @@ def format_array_from_datetime( # Use string formatting for faster strftime y = dts.year h = dts.hour - result[i] = str_format % { + res = str_format % { "year": y, "shortyear": y % 100, "month": dts.month, From 442732fc550b17be8e79c701a61ac9f7af24f194 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 7 Feb 2023 15:41:31 +0100 Subject: [PATCH 016/115] Fixed issue in `format_array_from_datetime` when tz was not None --- pandas/_libs/tslib.pyx | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index ee822ee1aa83c..563ac88a3aeb4 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -182,20 +182,22 @@ def format_array_from_datetime( # Default format for dates basic_format_day = True - elif fast_strftime: - if format is None: - # We'll fallback to the Timestamp.str method - fast_strftime = False - else: - try: - # Try to get the string formatting template for this format - str_format, loc_s = convert_strftime_format(format, target="datetime") - except UnsupportedStrFmtDirective: - # Unsupported directive: fallback to standard `strftime` - fast_strftime = False - + # Sanity check - these flags are exclusive assert not (basic_format_day and basic_format) + if not basic_format_day and not basic_format and fast_strftime: + # Preprocessing for fast_strftime + if format is None: + # We'll fallback to the Timestamp.str method + fast_strftime = False + else: + try: + # Try to get the string formatting template for this format + str_format, loc_s = convert_strftime_format(format, target="datetime") + except UnsupportedStrFmtDirective: + # Unsupported directive: fallback to standard `strftime` + fast_strftime = False + for i in range(N): # Analogous to: utc_val = values[i] val = (cnp.PyArray_ITER_DATA(it))[0] From 5ae370700406cd6add3cbfd582c9eb639f8d6c89 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 7 Feb 2023 15:43:44 +0100 Subject: [PATCH 017/115] Added 2 todos --- pandas/_libs/tslibs/timedeltas.pyx | 1 + pandas/io/formats/csvs.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index f3de67b705d4d..fc78d04f52cda 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1403,6 +1403,7 @@ cdef class _Timedelta(timedelta): comp_dict = self.components._asdict() comp_dict['sign'] = sign + # TODO make marginally faster using old-style python formatting ? return fmt.format(**comp_dict) def __repr__(self) -> str: diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 95a8b17c35c99..eaa35babe6f07 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -183,6 +183,7 @@ def data_index(self) -> Index: isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and self.date_format is not None ): + # TODO This branch seems unreachable, remove the if ? data_index = Index( [x.strftime(self.date_format) if notna(x) else "" for x in data_index] ) From 62aca61f8f659e400a75536ee910a4820169ba17 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 7 Feb 2023 16:05:45 +0100 Subject: [PATCH 018/115] `test_format`: Added various tests for the new feature --- pandas/tests/io/formats/test_format.py | 169 ++++++++++++++++++++++++- 1 file changed, 165 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index b3e2e81e95613..bd12c538fe890 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -22,6 +22,7 @@ import pytest import pytz +from pandas._libs.tslibs import convert_strftime_format from pandas.compat import ( IS64, is_platform_windows, @@ -3174,6 +3175,81 @@ def test_str(self): assert str(NaT) == "NaT" +class TestFastStrfTimeScalars: + """ + Test that `convert_strftime_format` and `fast_strftime` + work well together and rely on runtime locale + """ + @pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], + ) + def test_timestamp_locale(self, locale_str): + """Test for Timestamps""" + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. + with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # Use the function + str_tmp, loc_s = convert_strftime_format("%p", target="datetime") + assert str_tmp == "%(ampm)s" + + # Now what about the classes ? + # Timestamp + am_ts = Timestamp(2020, 1, 1, 1) + assert am_local == am_ts.strftime("%p") + assert am_local == am_ts.fast_strftime(str_tmp, loc_s) + pm_ts = Timestamp(2020, 1, 1, 13) + assert pm_local == pm_ts.strftime("%p") + assert pm_local == pm_ts.fast_strftime(str_tmp, loc_s) + + @pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], + ) + def test_period_locale(self, locale_str): + """Test for Periods""" + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. + with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # Use the function + str_tmp, loc_s = convert_strftime_format("%p", target="period") + assert str_tmp == "%(ampm)s" + + # Period + am_per = pd.Period("2018-03-11 01:00", freq="H") + assert am_local == am_per.strftime("%p") + assert am_local == am_per.fast_strftime(str_tmp, loc_s) + pm_per = pd.Period("2018-03-11 13:00", freq="H") + assert pm_local == pm_per.strftime("%p") + assert pm_local == pm_per.fast_strftime(str_tmp, loc_s) + + class TestPeriodIndexFormat: def test_period_format_and_strftime_default(self): per = pd.PeriodIndex([datetime(2003, 1, 1, 12), None], freq="H") @@ -3193,27 +3269,70 @@ def test_period_format_and_strftime_default(self): assert formatted[0] == "2003-01-01 12:01:01.123456789" assert formatted[1] == "2003-01-01 12:01:01.123456790" - def test_period_custom(self): + @pytest.mark.parametrize("fast_strftime", (False, True)) + def test_period_custom(self, fast_strftime): # GH#46252 custom formatting directives %l (ms) and %u (us) # 3 digits per = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="l") - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") + formatted = per.format( + date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime, + ) assert formatted[0] == "03 12:01:01 (ms=123 us=123000 ns=123000000)" assert formatted[1] == "03 12:01:01 (ms=124 us=124000 ns=124000000)" # 6 digits per = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="u") - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") + formatted = per.format( + date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime, + ) assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456000)" assert formatted[1] == "03 12:01:01 (ms=123 us=123457 ns=123457000)" # 9 digits per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") + formatted = per.format( + date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime, + ) assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456789)" assert formatted[1] == "03 12:01:01 (ms=123 us=123456 ns=123456790)" + @pytest.mark.parametrize("fast_strftime", (False, True)) + @pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], + ) + def test_period_custom_pm(self, fast_strftime, locale_str): + """Test that using %p in the custom format work well""" + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. + with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # 9 digits + p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") + formatted = p.format( + date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime + ) + assert formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456789)" + assert formatted[1] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456790)" + # fmt: on + def test_period_tz(self): # Formatting periods created from a datetime with timezone. @@ -3296,10 +3415,52 @@ def test_period_custom_locale_directive(self, locale_str): class TestDatetimeIndexFormat: def test_datetime(self): + """Test default `format()` with tz-naive datetime index.""" formatted = pd.to_datetime([datetime(2003, 1, 1, 12), NaT]).format() assert formatted[0] == "2003-01-01 12:00:00" assert formatted[1] == "NaT" + def test_datetime_tz(self): + """Test default `format()` with tz-aware datetime index.""" + # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC + dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) + # Since tz is currently set as utc, we'll see 2012 + assert dt.format()[0] == "2012-12-31 23:00:00+00:00" + # If we set tz as paris, we'll see 2013 + dt = dt.tz_convert("Europe/Paris") + assert dt.format()[0] == "2013-01-01 00:00:00+01:00" + + def test_datetime_tz_custom(self): + """Test `format()` with tz-aware dt and a custom format string.""" + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC + dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) + + # If tz is currently set as utc, we'll see 2012 + assert ( + dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] + == "2012-12-31__foo__23:00:00" + ) + # same with fancy format + assert ( + dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] + == f"2012-12-31__foo__11:00:00{pm_local}" + ) + + # If tz is currently set as paris, we'll see 2013 + dt = dt.tz_convert("Europe/Paris") + assert ( + dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] + == "2013-01-01__foo__00:00:00" + ) + # same with fancy format + assert ( + dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] + == f"2013-01-01__foo__12:00:00{am_local}" + ) + def test_date(self): formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() assert formatted[0] == "2003-01-01" From 2ec16ba29325ceb446a705bed4a4df57cea670b6 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 8 Feb 2023 14:31:38 +0100 Subject: [PATCH 019/115] Fixed Datetime64TZFormatter issue due to arg renaming in recent commits --- pandas/io/formats/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 422e24eee06b8..6dad378ee77d8 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1875,7 +1875,7 @@ def _format_strings(self) -> list[str]: # are not universal dates but have 00:00:00 timestamps in the given timezone. assert not is_dates_only(values) formatter = self.formatter or get_format_datetime64( - is_dates_only=False, + is_dates_only_=False, date_format=self.date_format, fast_strftime=self.fast_strftime, ) From c325431a58061c03677b99623b03e2d2401a6118 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 8 Feb 2023 15:13:57 +0100 Subject: [PATCH 020/115] Added 2 asv benchs for strftime with iso8601 format, and a variant for tz-aware --- asv_bench/benchmarks/strftime.py | 57 ++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index ac1b7f65d2d90..45a6ba412b99a 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -6,10 +6,10 @@ class DatetimeStrftime: timeout = 1500 - params = [1000, 10000] - param_names = ["obs"] + params = ([1000, 10000], [False, True]) + param_names = ["obs", "tz_aware"] - def setup(self, obs): + def setup(self, obs, tz_aware): d = "2018-11-29" dt = "2018-11-26 11:18:27.0" self.data = pd.DataFrame( @@ -19,31 +19,45 @@ def setup(self, obs): "r": [np.random.uniform()] * obs, } ) + if tz_aware: + self.data["dt"] = self.data["dt"].dt.tz_localize("UTC") + self.data["d"] = self.data["d"].dt.tz_localize("UTC") - def time_frame_date_to_str(self, obs): + def time_frame_date_to_str(self, obs, tz_aware): self.data["d"].astype(str) - def time_frame_date_formatting_default(self, obs): + def time_frame_date_formatting_default(self, obs, tz_aware): self.data["d"].dt.strftime(date_format="%Y-%m-%d") - def time_frame_date_formatting_custom(self, obs): + def time_frame_date_formatting_custom(self, obs, tz_aware): self.data["d"].dt.strftime(date_format="%Y---%m---%d") - def time_frame_datetime_to_str(self, obs): + def time_frame_datetime_to_str(self, obs, tz_aware): self.data["dt"].astype(str) - def time_frame_datetime_formatting_default_date_only(self, obs): + def time_frame_datetime_formatting_default_date_only(self, obs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d") - def time_frame_datetime_formatting_default(self, obs): + def time_frame_datetime_formatting_default(self, obs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S") - def time_frame_datetime_formatting_default_with_float(self, obs): + def time_frame_datetime_formatting_default_with_float(self, obs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") - def time_frame_datetime_formatting_custom(self, obs): + def time_frame_datetime_formatting_custom(self, obs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") + def time_frame_datetime_formatting_iso8601_map(self, obs, tz_aware): + self.data["dt"].map(lambda timestamp: timestamp.isoformat()) + + def time_frame_datetime_formatting_iso8601_strftime(self, obs, tz_aware): + self.data["dt"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%SZ") + + # def time_frame_datetime_formatting_iso8601_isoformat(self, obs, tz_aware): + # TODO this PR is probably a good opportunity to add this too, or maybe + # another PR + # self.data["dt"].dt.isoformat() + class BusinessHourStrftime: timeout = 1500 @@ -62,3 +76,24 @@ def time_frame_offset_str(self, obs): def time_frame_offset_repr(self, obs): self.data["off"].apply(repr) + + +if __name__ == '__main__': + for cls in (DatetimeStrftime, BusinessHourStrftime): + all_params = dict() + all_p_values = cls.params + if len(cls.param_names) == 1: + all_p_values = (all_p_values, ) + for p_name, p_values in zip(cls.param_names, all_p_values): + all_params[p_name] = p_values + + from itertools import product + for case in product(*all_params.values()): + p_dict = {p_name: p_val for p_name, p_val in zip(all_params.keys(), case)} + print(f"{cls.__name__} - {p_dict}") + o = cls() + o.setup(**p_dict) + for m_name, m in cls.__dict__.items(): + if callable(m): + print(m_name) + m(o, **p_dict) From 6c1188a7c153ed21b3081277c541f08f3e335c0c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 8 Feb 2023 19:04:00 +0100 Subject: [PATCH 021/115] blackened, flake8, and removed asv main --- asv_bench/benchmarks/strftime.py | 21 ------ pandas/_libs/tslibs/strftime.py | 1 - pandas/core/arrays/datetimes.py | 7 +- pandas/core/arrays/period.py | 7 +- pandas/core/arrays/timedeltas.py | 7 +- pandas/io/formats/csvs.py | 3 + pandas/io/formats/format.py | 5 +- pandas/tests/io/formats/test_format.py | 11 ++- pandas/tests/tslibs/test_strftime.py | 100 ++++++++++++++++--------- 9 files changed, 96 insertions(+), 66 deletions(-) diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index 45a6ba412b99a..9fa7e7000e32f 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -76,24 +76,3 @@ def time_frame_offset_str(self, obs): def time_frame_offset_repr(self, obs): self.data["off"].apply(repr) - - -if __name__ == '__main__': - for cls in (DatetimeStrftime, BusinessHourStrftime): - all_params = dict() - all_p_values = cls.params - if len(cls.param_names) == 1: - all_p_values = (all_p_values, ) - for p_name, p_values in zip(cls.param_names, all_p_values): - all_params[p_name] = p_values - - from itertools import product - for case in product(*all_params.values()): - p_dict = {p_name: p_val for p_name, p_val in zip(all_params.keys(), case)} - print(f"{cls.__name__} - {p_dict}") - o = cls() - o.setup(**p_dict) - for m_name, m in cls.__dict__.items(): - if callable(m): - print(m_name) - m(o, **p_dict) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 0567e9c3c647b..82e7039718bb7 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -118,7 +118,6 @@ def get_current_locale_specific_string() -> LocaleSpecificDtStrings: This function caches results in the `_locale_specifics` dict. """ - global _locale_specifics # Get current locale current_locale = locale.setlocale(locale.LC_ALL) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8af0a538978cc..1b5d0f4ed3758 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -704,7 +704,12 @@ def astype(self, dtype, copy: bool = True): # Rendering Methods def _format_native_types( - self, *, na_rep: str | float = "NaT", date_format=None, fast_strftime=True, **kwargs + self, + *, + na_rep: str | float = "NaT", + date_format=None, + fast_strftime=True, + **kwargs, ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_datetime64_from_values diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 5a11b3764f148..f795ab8e60e4e 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -621,7 +621,12 @@ def _formatter(self, boxed: bool = False): @dtl.ravel_compat def _format_native_types( - self, *, na_rep: str | float = "NaT", date_format=None, fast_strftime=True, **kwargs + self, + *, + na_rep: str | float = "NaT", + date_format=None, + fast_strftime=True, + **kwargs, ) -> npt.NDArray[np.object_]: """ actually format my specific types diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 9fb0609280641..cfffb89f30c9a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -439,7 +439,12 @@ def _formatter(self, boxed: bool = False): return get_format_timedelta64(self, box=True) def _format_native_types( - self, *, na_rep: str | float = "NaT", date_format=None, fast_strftime=True, **kwargs + self, + *, + na_rep: str | float = "NaT", + date_format=None, + fast_strftime=True, + **kwargs, ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_timedelta64 diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 5e46e5ac64cc6..774dd2f7c6dd9 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -29,9 +29,12 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.generic import ( + ABCDatetimeIndex, ABCIndex, ABCMultiIndex, + ABCPeriodIndex, ) +from pandas.core.dtypes.missing import notna from pandas.core.indexes.api import Index diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 6dad378ee77d8..f0f0ba6a9fbbb 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1833,8 +1833,9 @@ def get_format_datetime64( if date_format is not None and fast_strftime: try: # Try to get the string formatting template for this format - str_date_fmt, loc_s = convert_strftime_format(date_format, - target="datetime") + str_date_fmt, loc_s = convert_strftime_format( + date_format, target="datetime" + ) except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` pass diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index c79cbb66dca82..ce035c46a6690 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3164,6 +3164,7 @@ class TestFastStrfTimeScalars: Test that `convert_strftime_format` and `fast_strftime` work well together and rely on runtime locale """ + @pytest.mark.parametrize( "locale_str", [ @@ -3311,10 +3312,14 @@ def test_period_custom_pm(self, fast_strftime, locale_str): p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") formatted = p.format( date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", - fast_strftime=fast_strftime + fast_strftime=fast_strftime, + ) + assert ( + formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456789)" + ) + assert ( + formatted[1] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456790)" ) - assert formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456789)" - assert formatted[1] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456790)" # fmt: on def test_period_tz(self): diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py index 76a286e13eb8f..c969fec508e89 100644 --- a/pandas/tests/tslibs/test_strftime.py +++ b/pandas/tests/tslibs/test_strftime.py @@ -2,15 +2,17 @@ Test datetime formatting low-level routines """ from contextlib import nullcontext - from datetime import time - import locale + import pytest from pandas._libs.tslibs import convert_strftime_format -from pandas._libs.tslibs.strftime import get_current_locale_specific_string, \ - UnsupportedStrFmtDirective +from pandas._libs.tslibs.strftime import ( + UnsupportedStrFmtDirective, + get_current_locale_specific_string, +) + import pandas._testing as tm @@ -52,43 +54,66 @@ def test_get_current_locale_specific_string(locale_str): class TestConvertStrftimeFormat: """Tests for `convert_strftime_format`.""" - @pytest.mark.parametrize("strftime_fmt,res_fmt_old,res_fmt_new", ( - ("%p", "%(ampm)s", "{ampm:s}"), - ("%m-%d-%Y", "%(month)02d-%(day)02d-%(year)d", "{month:02d}-{day:02d}-{year:d}"), - ("20%y-%m-%d__foo__%I:%M:%S%p", - "20%(shortyear)02d-%(month)02d-%(day)02d__foo__%(hour12)02d:%(min)02d:%(sec)02d%(ampm)s", - "20{shortyear:02d}-{month:02d}-{day:02d}__foo__{hour12:02d}:{min:02d}:{sec:02d}{ampm:s}") - )) + @pytest.mark.parametrize( + "strftime_fmt,res_fmt_old,res_fmt_new", + ( + ("%p", "%(ampm)s", "{ampm:s}"), + ( + "%m-%d-%Y", + "%(month)02d-%(day)02d-%(year)d", + "{month:02d}-{day:02d}-{year:d}", + ), + ( + "20%y-%m-%d__foo__%I:%M:%S%p", + "20%(shortyear)02d-%(month)02d-%(day)02d__foo__%(hour12)02d:%(min)02d:%(sec)02d%(ampm)s", + "20{shortyear:02d}-{month:02d}-{day:02d}__foo__{hour12:02d}:{min:02d}:{sec:02d}{ampm:s}", + ), + ), + ) def test_format_datetime(self, strftime_fmt, res_fmt_old, res_fmt_new): """Test that `convert_strftime_format` returns the correct formatting template""" - str_tmp, loc_s = convert_strftime_format(strftime_fmt, target="datetime", - new_style_fmt=False) + str_tmp, loc_s = convert_strftime_format( + strftime_fmt, target="datetime", new_style_fmt=False + ) assert str_tmp == res_fmt_old - str_tmp_new, loc_s2 = convert_strftime_format(strftime_fmt, target="datetime", - new_style_fmt=True) + str_tmp_new, loc_s2 = convert_strftime_format( + strftime_fmt, target="datetime", new_style_fmt=True + ) assert loc_s2 == loc_s assert str_tmp_new == res_fmt_new - @pytest.mark.parametrize("strftime_fmt,res_fmt_old,res_fmt_new", ( + @pytest.mark.parametrize( + "strftime_fmt,res_fmt_old,res_fmt_new", + ( ("%p", "%(ampm)s", "{ampm:s}"), - ("%m-%d-%Y", "%(month)02d-%(day)02d-%(year)d", - "{month:02d}-{day:02d}-{year:d}"), - ("%y %I:%M:%S%p (ms=%l us=%u ns=%n)", - "%(shortyear)02d %(hour12)02d:%(min)02d:%(sec)02d%(ampm)s (ms=%(ms)03d us=%(us)06d ns=%(ns)09d)", - "{shortyear:02d} {hour12:02d}:{min:02d}:{sec:02d}{ampm:s} (ms={ms:03d} us={us:06d} ns={ns:09d})"), - ("20%y-%m-%d__foo__%I:%M:%S%p", - "20%(shortyear)02d-%(month)02d-%(day)02d__foo__%(hour12)02d:%(min)02d:%(sec)02d%(ampm)s", - "20{shortyear:02d}-{month:02d}-{day:02d}__foo__{hour12:02d}:{min:02d}:{sec:02d}{ampm:s}") - )) + ( + "%m-%d-%Y", + "%(month)02d-%(day)02d-%(year)d", + "{month:02d}-{day:02d}-{year:d}", + ), + ( + "%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + "%(shortyear)02d %(hour12)02d:%(min)02d:%(sec)02d%(ampm)s (ms=%(ms)03d us=%(us)06d ns=%(ns)09d)", + "{shortyear:02d} {hour12:02d}:{min:02d}:{sec:02d}{ampm:s} (ms={ms:03d} us={us:06d} ns={ns:09d})", + ), + ( + "20%y-%m-%d__foo__%I:%M:%S%p", + "20%(shortyear)02d-%(month)02d-%(day)02d__foo__%(hour12)02d:%(min)02d:%(sec)02d%(ampm)s", + "20{shortyear:02d}-{month:02d}-{day:02d}__foo__{hour12:02d}:{min:02d}:{sec:02d}{ampm:s}", + ), + ), + ) def test_format_period(self, strftime_fmt, res_fmt_old, res_fmt_new): """Test that `convert_strftime_format` returns the correct formatting template""" - str_tmp, loc_s = convert_strftime_format(strftime_fmt, target="period", - new_style_fmt=False) + str_tmp, loc_s = convert_strftime_format( + strftime_fmt, target="period", new_style_fmt=False + ) assert str_tmp == res_fmt_old - str_tmp_new, loc_s2 = convert_strftime_format(strftime_fmt, target="period", - new_style_fmt=True) + str_tmp_new, loc_s2 = convert_strftime_format( + strftime_fmt, target="period", new_style_fmt=True + ) assert loc_s2 == loc_s assert str_tmp_new == res_fmt_new @@ -114,17 +139,19 @@ def test_format_non_ascii(self, locale_str, target): with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): strftime_fmt = "%y é" - str_tmp, _ = convert_strftime_format(strftime_fmt, target="datetime", - new_style_fmt=False) + str_tmp, _ = convert_strftime_format( + strftime_fmt, target="datetime", new_style_fmt=False + ) assert str_tmp == "%(shortyear)02d é" - str_tmp_new, _ = convert_strftime_format(strftime_fmt, target="datetime", - new_style_fmt=True) + str_tmp_new, _ = convert_strftime_format( + strftime_fmt, target="datetime", new_style_fmt=True + ) assert str_tmp_new == "{shortyear:02d} é" def test_invalid_datetime_directive(self): """Test that using invalid strftime directives for datetime raises an error""" - with pytest.raises(UnsupportedStrFmtDirective): + with pytest.raises(UnsupportedStrFmtDirective, match="Unsupported directive"): convert_strftime_format("%F", target="datetime") def test_unknown_directive(self): @@ -132,6 +159,7 @@ def test_unknown_directive(self): res_str, _ = convert_strftime_format("%O", target="datetime") assert res_str == "%%O" - res_str, _ = convert_strftime_format("%O", target="datetime", - new_style_fmt=True) + res_str, _ = convert_strftime_format( + "%O", target="datetime", new_style_fmt=True + ) assert res_str == "%O" From 1fad7b6e30276053078771e4d4741a98817df2b4 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 10 Feb 2023 15:27:09 +0100 Subject: [PATCH 022/115] Minor improvement --- pandas/_libs/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index fdddec1de42c4..8397c416b38ee 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -229,7 +229,7 @@ def format_array_from_datetime( "shortyear": y % 100, "month": dts.month, "day": dts.day, - "hour": dts.hour, + "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), "ampm": loc_s.pm if (h // 12) else loc_s.am, "min": dts.min, From e33ad6573f94254f8fc4f21f9fd5cbdf59280aa0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 10 Feb 2023 15:59:14 +0100 Subject: [PATCH 023/115] Added new ASVs for strftime --- asv_bench/benchmarks/strftime.py | 45 +++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index 9fa7e7000e32f..3017288781ff8 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -50,15 +50,58 @@ def time_frame_datetime_formatting_custom(self, obs, tz_aware): def time_frame_datetime_formatting_iso8601_map(self, obs, tz_aware): self.data["dt"].map(lambda timestamp: timestamp.isoformat()) - def time_frame_datetime_formatting_iso8601_strftime(self, obs, tz_aware): + def time_frame_datetime_formatting_iso8601_strftime_Z(self, obs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%SZ") + def time_frame_datetime_formatting_iso8601_strftime_offset(self, obs, tz_aware): + """Not optimized yet as %z is not supported by `convert_strftime_format`""" + self.data["dt"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%S%z") + # def time_frame_datetime_formatting_iso8601_isoformat(self, obs, tz_aware): # TODO this PR is probably a good opportunity to add this too, or maybe # another PR # self.data["dt"].dt.isoformat() +class PeriodStrftime: + timeout = 1500 + params = ([1000, 10000], ["D", "H"]) + param_names = ["obs", "fq"] + + def setup(self, obs, fq): + self.data = pd.DataFrame( + { + "p": pd.period_range(start="2000-01-01", periods=obs, freq=fq), + "r": [np.random.uniform()] * obs, + } + ) + + def time_frame_period_to_str(self, obs, fq): + self.data["p"].astype(str) + + def time_frame_period_formatting_default_date_only(self, obs, fq): + self.data["p"].dt.strftime(date_format="%Y-%m-%d") + + def time_frame_period_formatting_default(self, obs, fq): + self.data["p"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S") + + def time_frame_period_formatting_default_with_float(self, obs, fq): + self.data["p"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") + + def time_frame_period_formatting_custom(self, obs, fq): + self.data["p"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") + + # def time_frame_period_formatting_iso8601_map(self, obs, fq): + # self.data["p"].map(lambda p: p.isoformat()) + + def time_frame_period_formatting_iso8601_strftime_Z(self, obs, fq): + self.data["p"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%SZ") + + def time_frame_period_formatting_iso8601_strftime_offset(self, obs, fq): + """Not optimized yet as %z is not supported by `convert_strftime_format`""" + self.data["p"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%S%z") + + class BusinessHourStrftime: timeout = 1500 params = [1000, 10000] From 645669432b6693206b3f3b4943dd9141bebed208 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 10 Feb 2023 16:15:34 +0100 Subject: [PATCH 024/115] Added asvs for period --- asv_bench/benchmarks/tslibs/period.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index af10102749627..b1aa40ffc8a20 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -72,6 +72,12 @@ def time_now(self, freq): def time_asfreq(self, freq): self.per.asfreq("A") + def time_str(self, freq): + str(self.per) + + def time_repr(self, freq): + repr(self.per) + class PeriodConstructor: params = [["D"], [True, False]] From 91ad1948f4e1fde68e734ef90d9f0f18a6b4796e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 14 Feb 2023 09:50:11 +0100 Subject: [PATCH 025/115] Added asvs for datetime and period indexes .format --- asv_bench/benchmarks/strftime.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index 3017288781ff8..2ebd807a086d7 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -29,9 +29,15 @@ def time_frame_date_to_str(self, obs, tz_aware): def time_frame_date_formatting_default(self, obs, tz_aware): self.data["d"].dt.strftime(date_format="%Y-%m-%d") + def time_frame_date_formatting_index_default(self, obs, tz_aware): + self.data.set_index("d").index.format() + def time_frame_date_formatting_custom(self, obs, tz_aware): self.data["d"].dt.strftime(date_format="%Y---%m---%d") + def time_frame_date_formatting_index_custom(self, obs, tz_aware): + self.data.set_index("d").index.format(date_format="%Y---%m---%d") + def time_frame_datetime_to_str(self, obs, tz_aware): self.data["dt"].astype(str) @@ -44,9 +50,15 @@ def time_frame_datetime_formatting_default(self, obs, tz_aware): def time_frame_datetime_formatting_default_with_float(self, obs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") + def time_frame_datetime_formatting_index_default(self, obs, tz_aware): + self.data.set_index("dt").index.format() + def time_frame_datetime_formatting_custom(self, obs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") + def time_frame_datetime_formatting_index_custom(self, obs, tz_aware): + self.data.set_index("dt").index.format(date_format="%Y-%m-%d --- %H:%M:%S") + def time_frame_datetime_formatting_iso8601_map(self, obs, tz_aware): self.data["dt"].map(lambda timestamp: timestamp.isoformat()) @@ -85,6 +97,9 @@ def time_frame_period_formatting_default_date_only(self, obs, fq): def time_frame_period_formatting_default(self, obs, fq): self.data["p"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S") + def time_frame_period_formatting_index_default(self, obs, fq): + self.data.set_index("p").index.format() + def time_frame_period_formatting_default_with_float(self, obs, fq): self.data["p"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") From 80ebc824cc5bd67b501aaf6d0d1829ea6fb88de5 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 14 Feb 2023 09:55:17 +0100 Subject: [PATCH 026/115] `convert_strftime_format` is now part of the public API as it is required by `Period.fast_strftime` and `Timestamp.fast_strftime` --- pandas/__init__.py | 2 ++ pandas/_libs/__init__.py | 2 ++ pandas/core/api.py | 2 ++ pandas/tests/api/test_api.py | 1 + pandas/tests/tslibs/test_strftime.py | 2 +- 5 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 1a549c09d22f7..6edc7b1013f5c 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -92,6 +92,7 @@ Interval, interval_range, DateOffset, + convert_strftime_format, # conversion to_numeric, to_datetime, @@ -277,6 +278,7 @@ "arrays", "bdate_range", "concat", + "convert_strftime_format", "crosstab", "cut", "date_range", diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py index f119e280f5867..97236c62becad 100644 --- a/pandas/_libs/__init__.py +++ b/pandas/_libs/__init__.py @@ -5,6 +5,7 @@ "Period", "Timedelta", "Timestamp", + "convert_strftime_format", "iNaT", "Interval", ] @@ -18,5 +19,6 @@ Period, Timedelta, Timestamp, + convert_strftime_format, iNaT, ) diff --git a/pandas/core/api.py b/pandas/core/api.py index c0b828d9330b4..77a02e75b2b82 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -1,4 +1,5 @@ from pandas._libs import ( + convert_strftime_format, NaT, Period, Timedelta, @@ -116,6 +117,7 @@ "NaT", "notna", "notnull", + "convert_strftime_format", "Period", "PeriodDtype", "PeriodIndex", diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index e448e1bce9146..0227ccd4263ac 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -100,6 +100,7 @@ class TestPDApi(Base): "array", "bdate_range", "concat", + "convert_strftime_format", "crosstab", "cut", "date_range", diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py index c969fec508e89..1341ff01cd904 100644 --- a/pandas/tests/tslibs/test_strftime.py +++ b/pandas/tests/tslibs/test_strftime.py @@ -7,7 +7,7 @@ import pytest -from pandas._libs.tslibs import convert_strftime_format +from pandas import convert_strftime_format from pandas._libs.tslibs.strftime import ( UnsupportedStrFmtDirective, get_current_locale_specific_string, From bf170ba5d4e2f1b0872daea556135cbe201bd5b4 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Feb 2023 11:40:17 +0100 Subject: [PATCH 027/115] Updated whats new 1.5.4 --- doc/source/whatsnew/v1.5.4.rst | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/doc/source/whatsnew/v1.5.4.rst b/doc/source/whatsnew/v1.5.4.rst index 0d91424eb65ac..4dea71e4d8a1b 100644 --- a/doc/source/whatsnew/v1.5.4.rst +++ b/doc/source/whatsnew/v1.5.4.rst @@ -8,6 +8,30 @@ including other versions of pandas. {{ header }} +.. --------------------------------------------------------------------------- +.. _whatsnew_154.strftime_perf: + +Strftime performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- :class:`Period`'s default formatter (`period_format`) is now much faster. This + improves performance of `str(Period)`, `repr(Period)`, and + :meth:`Period.strftime(fmt=None)`. +- New :func:`convert_strftime_format` to convert a strftime formatting template into + a python string formatting template. +- New :meth:`Period.fast_strftime` and :meth:`Timestamp.fast_strftime` leveraging + templates created with :func:`convert_strftime_format` +- New `fast_strftime` boolean flag in all formatting procedures to enable faster + strftime operations leveraging :func:`convert_strftime_format` and python string + formatting: + - in :meth:`DatetimeLikeArrayMixin.strftime` and + :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, + :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit + from the improvement. :class:`TimedeltaArray.strftime` and + :class:`TimedeltaArray.format` are not impacted as their `date_format` + argument is currently ignored. + - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and + :class:`CSVFormatter` + .. --------------------------------------------------------------------------- .. _whatsnew_154.regressions: From 4b3463927d8add0497973017c2db5c23d5af8c1b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 9 May 2023 12:59:17 +0200 Subject: [PATCH 028/115] Fixed issues following the merge from latest main. Introduced new function `fast_period_format` --- pandas/_libs/tslibs/period.pyi | 1 + pandas/_libs/tslibs/period.pyx | 113 +++++++++++++++++++++------------ pandas/core/arrays/period.py | 4 +- pandas/core/indexes/base.py | 1 + 4 files changed, 76 insertions(+), 43 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 31cfcde654396..c0f7e3da248cc 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -48,6 +48,7 @@ def period_array_strftime( dtype_code: int, na_rep, date_format: str | None, + fast_strftime: bool, ) -> npt.NDArray[np.object_]: ... # exposed for tests diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 0fe9fa73ca51d..b53e642af7346 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -54,6 +54,10 @@ from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct_to_datetime, pandas_datetime_to_datetimestruct, ) +from pandas._libs.tslibs.strftime import ( + UnsupportedStrFmtDirective, + convert_strftime_format, +) import_pandas_datetime() @@ -1232,6 +1236,43 @@ cdef str period_format(int64_t value, int freq, object fmt=None): return _period_strftime(value, freq, fmt, dts) +cdef str fast_period_format(int64_t value, int freq, object fmt_str, object loc_s): + """ + A faster alternative to `strftime` using string formatting. + `fmt_str` and `loc_s` should be created beforehand using + `convert_strftime_format(fmt, target="period")`. + """ + cdef: + npy_datetimestruct dts, dts2 + int quarter, y, h + + # Fill dts with all fields + get_date_info(value, freq, &dts) + + # Get the quarter and fiscal year + quarter = get_yq(value, freq, &dts2) + + # Finally use the string template + y = dts.year + h = dts.hour + return fmt_str % { + "year": y, + "shortyear": y % 100, + "month": dts.month, + "day": dts.day, + "hour": h, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": loc_s.pm if (h // 12) else loc_s.am, + "min": dts.min, + "sec": dts.sec, + "ms": dts.us // 1000, + "us": dts.us, + "ns": (dts.us * 1000) + (dts.ps // 1000), + "q": quarter, + "Fyear": dts2.year, + "fyear": dts2.year % 100, + } + cdef list extra_fmts = [(b"%q", b"^`AB`^"), (b"%f", b"^`CD`^"), (b"%F", b"^`EF`^"), @@ -1304,7 +1345,7 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt, npy_datetimestruct def period_array_strftime( - ndarray values, int dtype_code, object na_rep, str date_format + ndarray values, int dtype_code, object na_rep, str date_format, bint fast_strftime ): """ Vectorized Period.strftime used for PeriodArray._format_native_types. @@ -1316,6 +1357,9 @@ def period_array_strftime( Corresponds to PeriodDtype._dtype_code na_rep : any date_format : str or None + fast_strftime : bool + If `True` and the format permits it, a faster formatting + method will be used. See `convert_strftime_format`. """ cdef: Py_ssize_t i, n = values.size @@ -1326,6 +1370,18 @@ def period_array_strftime( ) object[::1] out_flat = out.ravel() cnp.broadcast mi = cnp.PyArray_MultiIterNew2(out, values) + object fmt_str, loc_s + + if fast_strftime: + if date_format is None: + fast_strftime = False + else: + try: + # Try to get the string formatting template for this format + fmt_str, loc_s = convert_strftime_format(date_format, target="period") + except UnsupportedStrFmtDirective: + # Unsupported directive: fallback to standard `strftime` + fast_strftime = False for i in range(n): # Analogous to: ordinal = values[i] @@ -1334,14 +1390,18 @@ def period_array_strftime( if ordinal == NPY_NAT: item_repr = na_rep else: - # This is equivalent to - # freq = frequency_corresponding_to_dtype_code(dtype_code) - # per = Period(ordinal, freq=freq) - # if date_format: - # item_repr = per.strftime(date_format) - # else: - # item_repr = str(per) - item_repr = period_format(ordinal, dtype_code, date_format) + if fast_strftime: + # Use python string formatting (faster than strftime) + item_repr = fast_period_format(ordinal, dtype_code, fmt_str, loc_s) + else: + # This is equivalent to + # freq = frequency_corresponding_to_dtype_code(dtype_code) + # per = Period(ordinal, freq=freq) + # if date_format: + # item_repr = per.strftime(date_format) + # else: + # item_repr = str(per) + item_repr = period_format(ordinal, dtype_code, date_format) # Analogous to: ordinals[i] = ordinal out_flat[i] = item_repr @@ -2449,42 +2509,13 @@ cdef class _Period(PeriodMixin): >>> a.fast_strftime(fast_fmt, loc_s) '2006-Q1' """ - freq = self._dtype._dtype_code value = self.ordinal if value == NPY_NAT: return "NaT" - - cdef: - npy_datetimestruct dts, dts2 - int quarter, y, h - - # Fill dts with all fields - get_date_info(value, freq, &dts) - - # Get the quarter and fiscal year - quarter = get_yq(value, freq, &dts2) - - # Finally use the string template - y = dts.year - h = dts.hour - return fmt_str % { - "year": y, - "shortyear": y % 100, - "month": dts.month, - "day": dts.day, - "hour": h, - "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": loc_s.pm if (h // 12) else loc_s.am, - "min": dts.min, - "sec": dts.sec, - "ms": dts.us // 1000, - "us": dts.us, - "ns": (dts.us * 1000) + (dts.ps // 1000), - "q": quarter, - "Fyear": dts2.year, - "fyear": dts2.year % 100, - } + else: + freq = self._dtype._dtype_code + return fast_period_format(value, freq, fmt_str, loc_s) def strftime(self, fmt: str) -> str: r""" diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 729a562e3ebfc..b9b984e34d330 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -640,14 +640,14 @@ def _format_native_types( *, na_rep: str | float = "NaT", date_format=None, - fast_strftime=True, + fast_strftime: bool = True, **kwargs, ) -> npt.NDArray[np.object_]: """ actually format my specific types """ return libperiod.period_array_strftime( - self.asi8, self.dtype._dtype_code, na_rep, date_format + self.asi8, self.dtype._dtype_code, na_rep, date_format, fast_strftime ) # ------------------------------------------------------------------ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7a52630296c27..174756c88f270 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1378,6 +1378,7 @@ def _format_native_types( decimal: str_t = ".", float_format=None, date_format=None, + fast_strftime: bool = True, quoting=None, ) -> npt.NDArray[np.object_]: """ From 6e189add40139e84bd4b592f478cc8cd3a5a3689 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 9 May 2023 13:07:05 +0200 Subject: [PATCH 029/115] Improved ASV bench slightly (added datetime index formatting tests) --- asv_bench/benchmarks/strftime.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index 3966078f0b135..183a4d669dfc2 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -23,15 +23,24 @@ def setup(self, nobs, tz_aware): self.data["dt"] = self.data["dt"].dt.tz_localize("UTC") self.data["d"] = self.data["d"].dt.tz_localize("UTC") + self.data["i"] = self.data["dt"] + self.data.set_index("i", inplace=True) + def time_frame_date_to_str(self, nobs, tz_aware): self.data["d"].astype(str) def time_frame_date_formatting_default(self, nobs, tz_aware): self.data["d"].dt.strftime(date_format=None) + def time_frame_date_formatting_index_default(self, nobs, tz_aware): + self.data.index.format() + def time_frame_date_formatting_custom(self, nobs, tz_aware): self.data["d"].dt.strftime(date_format="%Y---%m---%d") + def time_frame_date_formatting_index_custom(self, nobs, tz_aware): + self.data.index.format(date_format="%Y---%m---%d") + def time_frame_datetime_to_str(self, nobs, tz_aware): self.data["dt"].astype(str) From 9152dbcb3bae0877f2632314487cc03bc071e559 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 9 May 2023 13:10:47 +0200 Subject: [PATCH 030/115] Merged whatsnew --- doc/source/whatsnew/v2.1.0.rst | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 258c14cec7925..3ed22d8e8b64c 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -69,6 +69,26 @@ Also, note that :meth:`Categorical.map` implicitly has had its ``na_action`` set This has been deprecated and will :meth:`Categorical.map` in the future change the default to ``na_action=None``, like for all the other array types. +.. _whatsnew_210.enhancements.enhancement3: + +Strftime performance improvements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- New :func:`convert_strftime_format` to convert a strftime formatting template into + a python string formatting template. +- New :meth:`Period.fast_strftime` and :meth:`Timestamp.fast_strftime` leveraging + templates created with :func:`convert_strftime_format` +- New `fast_strftime` boolean flag in all formatting procedures to enable faster + strftime operations leveraging :func:`convert_strftime_format` and python string + formatting: + - in :meth:`DatetimeLikeArrayMixin.strftime` and + :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, + :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit + from the improvement. :class:`TimedeltaArray.strftime` and + :class:`TimedeltaArray.format` are not impacted as their `date_format` + argument is currently ignored. + - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and + :class:`CSVFormatter` + .. _whatsnew_210.enhancements.other: Other enhancements From 35a39a066c8e235e97522b28987004fa81fa30b0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 10 May 2023 00:31:36 +0200 Subject: [PATCH 031/115] pre-commit and docstring checks --- pandas/_libs/tslibs/strftime.py | 4 ++-- pandas/core/api.py | 2 +- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/period.py | 2 -- pandas/core/arrays/timedeltas.py | 2 +- pandas/tests/tslibs/test_strftime.py | 2 +- 7 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 82e7039718bb7..e7a9942812e6d 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -92,11 +92,11 @@ class LocaleSpecificDtStrings: __slots__ = ("am", "pm") - def __init__(self, am: str, pm: str): + def __init__(self, am: str, pm: str) -> None: self.am = am self.pm = pm - def __repr__(self): + def __repr__(self) -> str: attrs = ", ".join( [f"{k}={repr(getattr(self, k))}" for k in type(self).__slots__] ) diff --git a/pandas/core/api.py b/pandas/core/api.py index 77a02e75b2b82..968990fe12e51 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -1,9 +1,9 @@ from pandas._libs import ( - convert_strftime_format, NaT, Period, Timedelta, Timestamp, + convert_strftime_format, ) from pandas._libs.missing import NA diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c06a782272105..4e215c972d4d1 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -318,7 +318,7 @@ def asi8(self) -> npt.NDArray[np.int64]: # Rendering Methods def _format_native_types( - self, *, na_rep: str | float = "NaT", date_format=None, fast_strftime=True + self, *, na_rep: str | float = "NaT", date_format=None, fast_strftime: bool=True ) -> npt.NDArray[np.object_]: """ Helper method for astype when converting to strings. diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e7ebeaf137b76..c51e6253ded68 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -708,7 +708,7 @@ def _format_native_types( *, na_rep: str | float = "NaT", date_format=None, - fast_strftime=True, + fast_strftime: bool=True, **kwargs, ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_datetime64_from_values diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index b9b984e34d330..6d7262977d5eb 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -26,9 +26,7 @@ NaT, NaTType, Timedelta, - UnsupportedStrFmtDirective, astype_overflowsafe, - convert_strftime_format, dt64arr_to_periodarr as c_dt64arr_to_periodarr, get_unit_from_dtype, iNaT, diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 5bce21efbea84..72abb4a8da632 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -460,7 +460,7 @@ def _format_native_types( *, na_rep: str | float = "NaT", date_format=None, - fast_strftime=True, + fast_strftime: bool=True, **kwargs, ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_timedelta64 diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py index 1341ff01cd904..5e4bf66521dd7 100644 --- a/pandas/tests/tslibs/test_strftime.py +++ b/pandas/tests/tslibs/test_strftime.py @@ -7,12 +7,12 @@ import pytest -from pandas import convert_strftime_format from pandas._libs.tslibs.strftime import ( UnsupportedStrFmtDirective, get_current_locale_specific_string, ) +from pandas import convert_strftime_format import pandas._testing as tm From d13895bce4f3d5a84c41c1f88f812838ab55d036 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 10 May 2023 13:47:01 +0200 Subject: [PATCH 032/115] blackened --- pandas/core/arrays/datetimelike.py | 6 +++++- pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/timedeltas.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4e215c972d4d1..7ce10f571843f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -318,7 +318,11 @@ def asi8(self) -> npt.NDArray[np.int64]: # Rendering Methods def _format_native_types( - self, *, na_rep: str | float = "NaT", date_format=None, fast_strftime: bool=True + self, + *, + na_rep: str | float = "NaT", + date_format=None, + fast_strftime: bool = True, ) -> npt.NDArray[np.object_]: """ Helper method for astype when converting to strings. diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index c51e6253ded68..9ea7112ecf3a6 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -708,7 +708,7 @@ def _format_native_types( *, na_rep: str | float = "NaT", date_format=None, - fast_strftime: bool=True, + fast_strftime: bool = True, **kwargs, ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_datetime64_from_values diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 72abb4a8da632..7a1e39c68bb0e 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -460,7 +460,7 @@ def _format_native_types( *, na_rep: str | float = "NaT", date_format=None, - fast_strftime: bool=True, + fast_strftime: bool = True, **kwargs, ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_timedelta64 From 4007330f2c42a45c485f688b74d0b091b81f41bb Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 10 May 2023 14:09:16 +0200 Subject: [PATCH 033/115] Added two asvs --- asv_bench/benchmarks/io/json.py | 3 +++ asv_bench/benchmarks/strftime.py | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 9eaffddd8b87f..44072c7d10e18 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -207,6 +207,9 @@ def setup(self, orient): def time_iso_format(self, orient): self.df.to_json(orient=orient, date_format="iso") + def time_custom_format(self, orient): + self.df.to_json(orient=orient, date_format="%Y-%m-%d__%H:%M:%S") + class ToJSONLines(BaseIO): fname = "__test__.json" diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index 183a4d669dfc2..44f00e9e2ed77 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -103,6 +103,12 @@ def setup(self, nobs, freq): def time_frame_period_to_str(self, nobs, freq): self.data["p"].astype(str) + def time_frame_period_str(self, nobs): + self.data["p"].apply(str) + + def time_frame_period_repr(self, nobs): + self.data["p"].apply(repr) + def time_frame_period_formatting_default(self, nobs, freq): self.data["p"].dt.strftime(date_format=None) @@ -118,6 +124,9 @@ def time_frame_period_formatting_index_default_explicit(self, nobs, freq): def time_frame_period_formatting_custom(self, nobs, freq): self.data["p"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") + # def time_frame_period_formatting_iso8601_map(self, obs, fq): + # self.data["p"].map(lambda p: p.isoformat()) + def time_frame_period_formatting_iso8601_strftime_Z(self, nobs, freq): self.data["p"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%SZ") From 3d8d46967d30943c0f7e0dffbbd77b014d6c63e7 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 10 May 2023 15:42:24 +0200 Subject: [PATCH 034/115] Fixed mypy error --- pandas/_libs/tslibs/strftime.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index e7a9942812e6d..2d3cdcd2bc753 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -212,6 +212,7 @@ def convert_strftime_format( distribution, as well as specific additional directives ``%f``, ``%F``, ``%q``, ``%l``, ``%u``, ``%n``). """ + unsupported: Tuple[Tuple[str, ...], ...] if target in ("datetime", "date", "time"): directive_maps = (_COMMON_MAP, _DATETIME_MAP) unsupported = (_COMMON_UNSUPPORTED, _DATETIME_UNSUPPORTED) From 1fc3d48b4290d1d8c0042a4ecff462d859aad12b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 10 May 2023 15:45:39 +0200 Subject: [PATCH 035/115] Fixed ASV bench --- asv_bench/benchmarks/strftime.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index 44f00e9e2ed77..c889970b48308 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -62,20 +62,20 @@ def time_frame_datetime_formatting_index_default(self, nobs, tz_aware): def time_frame_datetime_formatting_custom(self, nobs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") - def time_frame_datetime_formatting_index_custom(self, obs, tz_aware): + def time_frame_datetime_formatting_index_custom(self, nobs, tz_aware): self.data.set_index("dt").index.format(date_format="%Y-%m-%d --- %H:%M:%S") - def time_frame_datetime_formatting_iso8601_map(self, obs, tz_aware): + def time_frame_datetime_formatting_iso8601_map(self, nobs, tz_aware): self.data["dt"].map(lambda timestamp: timestamp.isoformat()) - def time_frame_datetime_formatting_iso8601_strftime_Z(self, obs, tz_aware): + def time_frame_datetime_formatting_iso8601_strftime_Z(self, nobs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%SZ") - def time_frame_datetime_formatting_iso8601_strftime_offset(self, obs, tz_aware): + def time_frame_datetime_formatting_iso8601_strftime_offset(self, nobs, tz_aware): """Not optimized yet as %z is not supported by `convert_strftime_format`""" self.data["dt"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%S%z") - # def time_frame_datetime_formatting_iso8601_isoformat(self, obs, tz_aware): + # def time_frame_datetime_formatting_iso8601_isoformat(self, nobs, tz_aware): # TODO this PR is probably a good opportunity to add this too, or maybe # another PR # self.data["dt"].dt.isoformat() @@ -103,10 +103,10 @@ def setup(self, nobs, freq): def time_frame_period_to_str(self, nobs, freq): self.data["p"].astype(str) - def time_frame_period_str(self, nobs): + def time_frame_period_str(self, nobs, freq): self.data["p"].apply(str) - def time_frame_period_repr(self, nobs): + def time_frame_period_repr(self, nobs, freq): self.data["p"].apply(repr) def time_frame_period_formatting_default(self, nobs, freq): @@ -124,7 +124,9 @@ def time_frame_period_formatting_index_default_explicit(self, nobs, freq): def time_frame_period_formatting_custom(self, nobs, freq): self.data["p"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") - # def time_frame_period_formatting_iso8601_map(self, obs, fq): + # def time_frame_period_formatting_iso8601_map(self, nobs, fq): + # TODO this PR is probably a good opportunity to add this too, or maybe + # another PR # self.data["p"].map(lambda p: p.isoformat()) def time_frame_period_formatting_iso8601_strftime_Z(self, nobs, freq): From 69289412f2061b1181f810669a3afe77df3e3bd1 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 10 May 2023 16:16:33 +0200 Subject: [PATCH 036/115] Improved ASV benchs --- asv_bench/benchmarks/io/csv.py | 34 ++++++++++++++++++++++++--------- asv_bench/benchmarks/io/json.py | 5 +++-- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 86a983d3deb62..3ee9a3cdd0b5b 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -81,7 +81,13 @@ def setup(self): rng = date_range("1/1/2000", periods=1000) self.data = DataFrame(rng, index=rng) - def time_frame_date_formatting(self): + def time_frame_date_formatting_default(self): + self.data.to_csv(self.fname) + + def time_frame_date_formatting_default_explicit(self): + self.data.to_csv(self.fname, date_format="%Y-%m-%d") + + def time_frame_date_formatting_custom(self): self.data.to_csv(self.fname, date_format="%Y%m%d") @@ -92,11 +98,14 @@ def setup(self): rng = date_range("2000", periods=100_000, freq="S") self.data = DataFrame({"a": 1}, index=rng) - def time_frame_date_formatting_index(self): + def time_frame_date_formatting_index_default(self): + self.data.to_csv(self.fname) + + def time_frame_date_formatting_index_default_explicit(self): self.data.to_csv(self.fname, date_format="%Y-%m-%d %H:%M:%S") - def time_frame_date_no_format_index(self): - self.data.to_csv(self.fname) + def time_frame_date_formatting_index_custom(self): + self.data.to_csv(self.fname, date_format="%Y-%m-%d__%H:%M:%S") class ToCSVPeriod(BaseIO): @@ -119,7 +128,7 @@ def time_frame_period_formatting_default(self, nobs, freq): def time_frame_period_formatting_default_explicit(self, nobs, freq): self.data.to_csv(self.fname, date_format=self.default_fmt) - def time_frame_period_formatting(self, nobs, freq): + def time_frame_period_formatting_custom(self, nobs, freq): # Nb: `date_format` is not actually taken into account here today, so the # performance is currently identical to `time_frame_period_formatting_default` # above. This timer is therefore expected to degrade when GH#51621 is fixed. @@ -141,15 +150,19 @@ def setup(self, nobs, freq): elif freq == "H": self.default_fmt = "%Y-%m-%d %H:00" - def time_frame_period_formatting_index(self, nobs, freq): - self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S") - def time_frame_period_formatting_index_default(self, nobs, freq): self.data.to_csv(self.fname) def time_frame_period_formatting_index_default_explicit(self, nobs, freq): self.data.to_csv(self.fname, date_format=self.default_fmt) + def time_frame_period_formatting_index_custom(self, nobs, freq): + # Nb: `date_format` is not actually taken into account here today, so the + # performance is currently identical to `time_frame_period_formatting_default` + # above. This timer is therefore expected to degrade when GH#51621 is fixed. + # (Remove this comment when GH#51621 is fixed.) + self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S") + class ToCSVDatetimeBig(BaseIO): fname = "__test__.csv" @@ -168,9 +181,12 @@ def setup(self, nobs): } ) - def time_frame(self, nobs): + def time_frame_formatting_default(self, nobs): self.data.to_csv(self.fname) + def time_frame_date_formatting_custom(self): + self.data.to_csv(self.fname, date_format="%Y%m%d__%H%M%S") + class ToCSVIndexes(BaseIO): fname = "__test__.csv" diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 44072c7d10e18..8e008b6904df4 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -207,8 +207,9 @@ def setup(self, orient): def time_iso_format(self, orient): self.df.to_json(orient=orient, date_format="iso") - def time_custom_format(self, orient): - self.df.to_json(orient=orient, date_format="%Y-%m-%d__%H:%M:%S") + # Providing a custom `date_format` is not possible today, this test is pointless + # def time_custom_format(self, orient): + # self.df.to_json(orient=orient, date_format="%Y-%m-%d__%H:%M:%S") class ToJSONLines(BaseIO): From 7c87a2fe329df31c0c52ab18982e7f0349595cfa Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 11 May 2023 10:17:04 +0200 Subject: [PATCH 037/115] Fixed RST format in whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index bccdca35b2513..834e58f0626d8 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -77,7 +77,7 @@ Strftime performance improvements a python string formatting template. - New :meth:`Period.fast_strftime` and :meth:`Timestamp.fast_strftime` leveraging templates created with :func:`convert_strftime_format` -- New `fast_strftime` boolean flag in all formatting procedures to enable faster +- New ``fast_strftime`` boolean flag in all formatting procedures to enable faster strftime operations leveraging :func:`convert_strftime_format` and python string formatting: - in :meth:`DatetimeLikeArrayMixin.strftime` and From 21b7c9dda0b12d4e96c476a84cedc67b5c327f7e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 11 May 2023 10:20:15 +0200 Subject: [PATCH 038/115] Hopefully fixed the ASV bench for the case when the format is the default one but provided in an explicit way --- pandas/_libs/tslibs/period.pyx | 164 ++++++++++++++++++++------------- 1 file changed, 100 insertions(+), 64 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 0fe64dd05dd98..719eaf036c71f 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1159,11 +1159,20 @@ cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1: return npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT.NPY_FR_ns, &dts) -cdef str period_format(int64_t value, int freq, object fmt=None): +cdef str period_format( + int64_t value, + int freq, + object fmt=None, + object fast_fmt=None, + object fast_loc_am=None, + object fast_loc_pm=None, + bint preencoded_str=False +): + """Important: please provide a dummy non-None fmt if fast_fmt is non-None""" cdef: int freq_group, quarter - npy_datetimestruct dts + npy_datetimestruct dts, dts2 bint is_fmt_none if value == NPY_NAT: @@ -1228,52 +1237,51 @@ cdef str period_format(int64_t value, int freq, object fmt=None): # `freq_group` is invalid, raise raise ValueError(f"Unknown freq: {freq}") + elif fast_fmt is not None: + # A custom format is requested using python string formatting + + if not preencoded_str: + # Encode strings using current locale, in case they contain non-utf8 chars + if isinstance(fast_fmt, str): + fast_fmt = util.string_encode_locale(fast_fmt) + if isinstance(fast_loc_am, str): + fast_loc_am = util.string_encode_locale(fast_loc_am) + if isinstance(fast_loc_pm, str): + fast_loc_pm = util.string_encode_locale(fast_loc_pm) + + # Get the quarter and fiscal year + quarter = get_yq(value, freq, &dts2) + + # Finally use the string template + y = dts.year + h = dts.hour + return fast_fmt % { + "year": y, + "shortyear": y % 100, + "month": dts.month, + "day": dts.day, + "hour": h, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": fast_loc_pm if (h // 12) else fast_loc_am, + "min": dts.min, + "sec": dts.sec, + "ms": dts.us // 1000, + "us": dts.us, + "ns": (dts.us * 1000) + (dts.ps // 1000), + "q": quarter, + "Fyear": dts2.year, + "fyear": dts2.year % 100, + } else: - # A custom format is requested - if isinstance(fmt, str): + # A custom format is requested using strftime (slower) + + if not preencoded_str and isinstance(fmt, str): # Encode using current locale, in case fmt contains non-utf8 chars fmt = util.string_encode_locale(fmt) return _period_strftime(value, freq, fmt, dts) -cdef str fast_period_format(int64_t value, int freq, object fmt_str, object loc_s): - """ - A faster alternative to `strftime` using string formatting. - `fmt_str` and `loc_s` should be created beforehand using - `convert_strftime_format(fmt, target="period")`. - """ - cdef: - npy_datetimestruct dts, dts2 - int quarter, y, h - - # Fill dts with all fields - get_date_info(value, freq, &dts) - - # Get the quarter and fiscal year - quarter = get_yq(value, freq, &dts2) - - # Finally use the string template - y = dts.year - h = dts.hour - return fmt_str % { - "year": y, - "shortyear": y % 100, - "month": dts.month, - "day": dts.day, - "hour": h, - "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": loc_s.pm if (h // 12) else loc_s.am, - "min": dts.min, - "sec": dts.sec, - "ms": dts.us // 1000, - "us": dts.us, - "ns": (dts.us * 1000) + (dts.ps // 1000), - "q": quarter, - "Fyear": dts2.year, - "fyear": dts2.year % 100, - } - cdef list extra_fmts = [(b"%q", b"^`AB`^"), (b"%f", b"^`CD`^"), (b"%F", b"^`EF`^"), @@ -1371,18 +1379,33 @@ def period_array_strftime( ) object[::1] out_flat = out.ravel() cnp.broadcast mi = cnp.PyArray_MultiIterNew2(out, values) - object fmt_str, loc_s + object date_fmt_bytes + object fast_fmt = None + object fast_loc + object fast_loc_am + object fast_loc_pm - if fast_strftime: - if date_format is None: - fast_strftime = False + if fast_strftime and date_format is not None: + try: + # Try to get the string formatting template for this format + fast_fmt, fast_loc = convert_strftime_format(date_format, target="period") + fast_loc_am = fast_loc.am + fast_loc_pm = fast_loc.pm + except UnsupportedStrFmtDirective: + # Unsupported directive: fallback to standard `strftime` + pass else: - try: - # Try to get the string formatting template for this format - fmt_str, loc_s = convert_strftime_format(date_format, target="period") - except UnsupportedStrFmtDirective: - # Unsupported directive: fallback to standard `strftime` - fast_strftime = False + # Encode strings using current locale, in case they contain non-utf8 chars + if isinstance(fast_fmt, str): + fast_fmt = util.string_encode_locale(fast_fmt) + if isinstance(fast_loc_am, str): + fast_loc_am = util.string_encode_locale(fast_loc_am) + if isinstance(fast_loc_pm, str): + fast_loc_pm = util.string_encode_locale(fast_loc_pm) + + if fast_fmt is None: + # Encode string using current locale, in case it contains non-utf8 chars + date_fmt_bytes = util.string_encode_locale(date_format) for i in range(n): # Analogous to: ordinal = values[i] @@ -1391,18 +1414,24 @@ def period_array_strftime( if ordinal == NPY_NAT: item_repr = na_rep else: - if fast_strftime: - # Use python string formatting (faster than strftime) - item_repr = fast_period_format(ordinal, dtype_code, fmt_str, loc_s) - else: - # This is equivalent to - # freq = frequency_corresponding_to_dtype_code(dtype_code) - # per = Period(ordinal, freq=freq) - # if date_format: - # item_repr = per.strftime(date_format) - # else: - # item_repr = str(per) - item_repr = period_format(ordinal, dtype_code, date_format) + # This is equivalent to + # freq = frequency_corresponding_to_dtype_code(dtype_code) + # per = Period(ordinal, freq=freq) + # if fast_fmt: + # item_repr = per.fast_strftime(fast_fmt, fast_loc) + # elif date_format: + # item_repr = per.strftime(date_format) + # else: + # item_repr = str(per) + item_repr = period_format( + ordinal, + dtype_code, + date_format, + fast_fmt, + fast_loc_am, + fast_loc_pm, + True, + ) # Analogous to: ordinals[i] = ordinal out_flat[i] = item_repr @@ -2516,7 +2545,14 @@ cdef class _Period(PeriodMixin): return "NaT" else: freq = self._dtype._dtype_code - return fast_period_format(value, freq, fmt_str, loc_s) + return period_format( + value, + freq, + "dummy" if fmt_str is not None else None, + fmt_str, + loc_s.am, + loc_s.pm + ) def strftime(self, fmt: str) -> str: r""" From fdb7309671ec3b62c8eaca8c18a3b20e7eb89c2b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 11 May 2023 10:35:28 +0200 Subject: [PATCH 039/115] Fixed issue in period.pyx --- pandas/_libs/tslibs/period.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 719eaf036c71f..1e0712163fdba 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1426,7 +1426,7 @@ def period_array_strftime( item_repr = period_format( ordinal, dtype_code, - date_format, + date_fmt_bytes, fast_fmt, fast_loc_am, fast_loc_pm, From 636f27f4efafa1c2ca3019e14b4827e66353402d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 11 May 2023 10:40:06 +0200 Subject: [PATCH 040/115] Made hooks happy --- pandas/_libs/tslibs/strftime.py | 4 +++- pandas/tests/tslibs/test_strftime.py | 24 +++++++++++++++--------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 2d3cdcd2bc753..031aa64616299 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -201,7 +201,9 @@ def convert_strftime_format( See Also -------- - `strftime format codes reference `_ # noqa + `strftime format codes reference + `_ `Stackoverflow post `_ explaining how old-style formatting is faster than new-style formatting, diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py index 5e4bf66521dd7..a075fb6f67f1b 100644 --- a/pandas/tests/tslibs/test_strftime.py +++ b/pandas/tests/tslibs/test_strftime.py @@ -65,13 +65,15 @@ class TestConvertStrftimeFormat: ), ( "20%y-%m-%d__foo__%I:%M:%S%p", - "20%(shortyear)02d-%(month)02d-%(day)02d__foo__%(hour12)02d:%(min)02d:%(sec)02d%(ampm)s", - "20{shortyear:02d}-{month:02d}-{day:02d}__foo__{hour12:02d}:{min:02d}:{sec:02d}{ampm:s}", + "20%(shortyear)02d-%(month)02d-%(day)02d__foo__" + "%(hour12)02d:%(min)02d:%(sec)02d%(ampm)s", + "20{shortyear:02d}-{month:02d}-{day:02d}__foo__" + "{hour12:02d}:{min:02d}:{sec:02d}{ampm:s}", ), ), ) def test_format_datetime(self, strftime_fmt, res_fmt_old, res_fmt_new): - """Test that `convert_strftime_format` returns the correct formatting template""" + """Test that `convert_strftime_format` returns the correct template""" str_tmp, loc_s = convert_strftime_format( strftime_fmt, target="datetime", new_style_fmt=False ) @@ -94,18 +96,22 @@ def test_format_datetime(self, strftime_fmt, res_fmt_old, res_fmt_new): ), ( "%y %I:%M:%S%p (ms=%l us=%u ns=%n)", - "%(shortyear)02d %(hour12)02d:%(min)02d:%(sec)02d%(ampm)s (ms=%(ms)03d us=%(us)06d ns=%(ns)09d)", - "{shortyear:02d} {hour12:02d}:{min:02d}:{sec:02d}{ampm:s} (ms={ms:03d} us={us:06d} ns={ns:09d})", + "%(shortyear)02d %(hour12)02d:%(min)02d:%(sec)02d%(ampm)s " + "(ms=%(ms)03d us=%(us)06d ns=%(ns)09d)", + "{shortyear:02d} {hour12:02d}:{min:02d}:{sec:02d}{ampm:s} " + "(ms={ms:03d} us={us:06d} ns={ns:09d})", ), ( "20%y-%m-%d__foo__%I:%M:%S%p", - "20%(shortyear)02d-%(month)02d-%(day)02d__foo__%(hour12)02d:%(min)02d:%(sec)02d%(ampm)s", - "20{shortyear:02d}-{month:02d}-{day:02d}__foo__{hour12:02d}:{min:02d}:{sec:02d}{ampm:s}", + "20%(shortyear)02d-%(month)02d-%(day)02d__foo__" + "%(hour12)02d:%(min)02d:%(sec)02d%(ampm)s", + "20{shortyear:02d}-{month:02d}-{day:02d}__foo__" + "{hour12:02d}:{min:02d}:{sec:02d}{ampm:s}", ), ), ) def test_format_period(self, strftime_fmt, res_fmt_old, res_fmt_new): - """Test that `convert_strftime_format` returns the correct formatting template""" + """Test that `convert_strftime_format` returns the correct template""" str_tmp, loc_s = convert_strftime_format( strftime_fmt, target="period", new_style_fmt=False ) @@ -129,7 +135,7 @@ def test_format_period(self, strftime_fmt, res_fmt_old, res_fmt_new): ) @pytest.mark.parametrize("target", ("datetime", "date", "time", "period")) def test_format_non_ascii(self, locale_str, target): - """Test that `convert_strftime_format` is robust to locale and format encoding""" + """Test that `convert_strftime_format` is robust to locale and fmt encoding""" # Skip if locale cannot be set if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): From 3b96d0b397fe11230f862f9f2728e97557003054 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 11 May 2023 12:48:04 +0200 Subject: [PATCH 041/115] Fixed variables used before initialization --- pandas/_libs/tslibs/period.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 1e0712163fdba..75d1bd940f5b4 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1379,11 +1379,11 @@ def period_array_strftime( ) object[::1] out_flat = out.ravel() cnp.broadcast mi = cnp.PyArray_MultiIterNew2(out, values) - object date_fmt_bytes + object date_fmt_bytes = None object fast_fmt = None object fast_loc - object fast_loc_am - object fast_loc_pm + object fast_loc_am = None + object fast_loc_pm = None if fast_strftime and date_format is not None: try: From 3433c16a5bc724e7feed032d50be2e69b9d90605 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 12 May 2023 13:33:22 +0200 Subject: [PATCH 042/115] pre-commit hook upgrade --- pandas/_libs/tslibs/strftime.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 031aa64616299..0db0d1b849029 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -1,11 +1,9 @@ """Strftime-related classes and functions. """ +from __future__ import annotations + from datetime import time import locale -from typing import ( - Dict, - Tuple, -) class UnsupportedStrFmtDirective(ValueError): @@ -110,7 +108,7 @@ def get_current(cls): ) -_locale_specifics: Dict[str, LocaleSpecificDtStrings] = {} +_locale_specifics: dict[str, LocaleSpecificDtStrings] = {} def get_current_locale_specific_string() -> LocaleSpecificDtStrings: @@ -136,7 +134,7 @@ def convert_strftime_format( strftime_fmt: str, target: str, new_style_fmt: bool = False, -) -> Tuple[str, LocaleSpecificDtStrings]: +) -> tuple[str, LocaleSpecificDtStrings]: """Convert a strftime formatting string into a formatting template string. The set of supported directives varies according to the `target`. @@ -214,7 +212,7 @@ def convert_strftime_format( distribution, as well as specific additional directives ``%f``, ``%F``, ``%q``, ``%l``, ``%u``, ``%n``). """ - unsupported: Tuple[Tuple[str, ...], ...] + unsupported: tuple[tuple[str, ...], ...] if target in ("datetime", "date", "time"): directive_maps = (_COMMON_MAP, _DATETIME_MAP) unsupported = (_COMMON_UNSUPPORTED, _DATETIME_UNSUPPORTED) From 227f188e6f69b90d08b17d7fd075bcc0810ea26a Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 12 May 2023 21:35:35 +0200 Subject: [PATCH 043/115] Fixed error in period.pyx --- pandas/_libs/tslibs/period.pyx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 75d1bd940f5b4..12b1d4e753a46 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1379,12 +1379,19 @@ def period_array_strftime( ) object[::1] out_flat = out.ravel() cnp.broadcast mi = cnp.PyArray_MultiIterNew2(out, values) - object date_fmt_bytes = None + object date_fmt_bytes object fast_fmt = None object fast_loc object fast_loc_am = None object fast_loc_pm = None + if isinstance(date_format, str): + # Encode string using current locale, in case it contains non-utf8 chars + date_fmt_bytes = util.string_encode_locale(date_format) + else: + # None or bytes already + date_fmt_bytes = date_format + if fast_strftime and date_format is not None: try: # Try to get the string formatting template for this format @@ -1403,10 +1410,6 @@ def period_array_strftime( if isinstance(fast_loc_pm, str): fast_loc_pm = util.string_encode_locale(fast_loc_pm) - if fast_fmt is None: - # Encode string using current locale, in case it contains non-utf8 chars - date_fmt_bytes = util.string_encode_locale(date_format) - for i in range(n): # Analogous to: ordinal = values[i] ordinal = (cnp.PyArray_MultiIter_DATA(mi, 1))[0] From 9c03fe4b7da33c5c4272e6b081e0bcaf07c43040 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 14 May 2023 14:57:20 +0200 Subject: [PATCH 044/115] Fixed issue: there was no need to encode to bytes to apply string formatting --- pandas/_libs/tslibs/period.pyx | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 12b1d4e753a46..e70217da05194 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1166,7 +1166,6 @@ cdef str period_format( object fast_fmt=None, object fast_loc_am=None, object fast_loc_pm=None, - bint preencoded_str=False ): """Important: please provide a dummy non-None fmt if fast_fmt is non-None""" @@ -1240,19 +1239,11 @@ cdef str period_format( elif fast_fmt is not None: # A custom format is requested using python string formatting - if not preencoded_str: - # Encode strings using current locale, in case they contain non-utf8 chars - if isinstance(fast_fmt, str): - fast_fmt = util.string_encode_locale(fast_fmt) - if isinstance(fast_loc_am, str): - fast_loc_am = util.string_encode_locale(fast_loc_am) - if isinstance(fast_loc_pm, str): - fast_loc_pm = util.string_encode_locale(fast_loc_pm) - # Get the quarter and fiscal year quarter = get_yq(value, freq, &dts2) - # Finally use the string template + # Finally use the string template. Note: handling of non-utf8 chars is directly + # done in python here, no need to encode as for c-strftime y = dts.year h = dts.hour return fast_fmt % { @@ -1275,7 +1266,7 @@ cdef str period_format( else: # A custom format is requested using strftime (slower) - if not preencoded_str and isinstance(fmt, str): + if not isinstance(fmt, str): # Encode using current locale, in case fmt contains non-utf8 chars fmt = util.string_encode_locale(fmt) @@ -1401,14 +1392,6 @@ def period_array_strftime( except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` pass - else: - # Encode strings using current locale, in case they contain non-utf8 chars - if isinstance(fast_fmt, str): - fast_fmt = util.string_encode_locale(fast_fmt) - if isinstance(fast_loc_am, str): - fast_loc_am = util.string_encode_locale(fast_loc_am) - if isinstance(fast_loc_pm, str): - fast_loc_pm = util.string_encode_locale(fast_loc_pm) for i in range(n): # Analogous to: ordinal = values[i] @@ -1433,7 +1416,6 @@ def period_array_strftime( fast_fmt, fast_loc_am, fast_loc_pm, - True, ) # Analogous to: ordinals[i] = ordinal From d44a4c952dafa59762d915b6a2f464e9aa485f27 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 14 May 2023 14:57:20 +0200 Subject: [PATCH 045/115] Fixed issue: there was no need to encode to bytes to apply string formatting --- pandas/_libs/tslibs/period.pyx | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 12b1d4e753a46..74652565a53fd 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1166,7 +1166,6 @@ cdef str period_format( object fast_fmt=None, object fast_loc_am=None, object fast_loc_pm=None, - bint preencoded_str=False ): """Important: please provide a dummy non-None fmt if fast_fmt is non-None""" @@ -1240,19 +1239,11 @@ cdef str period_format( elif fast_fmt is not None: # A custom format is requested using python string formatting - if not preencoded_str: - # Encode strings using current locale, in case they contain non-utf8 chars - if isinstance(fast_fmt, str): - fast_fmt = util.string_encode_locale(fast_fmt) - if isinstance(fast_loc_am, str): - fast_loc_am = util.string_encode_locale(fast_loc_am) - if isinstance(fast_loc_pm, str): - fast_loc_pm = util.string_encode_locale(fast_loc_pm) - # Get the quarter and fiscal year quarter = get_yq(value, freq, &dts2) - # Finally use the string template + # Finally use the string template. Note: handling of non-utf8 chars is directly + # done in python here, no need to encode as for c-strftime y = dts.year h = dts.hour return fast_fmt % { @@ -1275,7 +1266,7 @@ cdef str period_format( else: # A custom format is requested using strftime (slower) - if not preencoded_str and isinstance(fmt, str): + if isinstance(fmt, str): # Encode using current locale, in case fmt contains non-utf8 chars fmt = util.string_encode_locale(fmt) @@ -1401,14 +1392,6 @@ def period_array_strftime( except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` pass - else: - # Encode strings using current locale, in case they contain non-utf8 chars - if isinstance(fast_fmt, str): - fast_fmt = util.string_encode_locale(fast_fmt) - if isinstance(fast_loc_am, str): - fast_loc_am = util.string_encode_locale(fast_loc_am) - if isinstance(fast_loc_pm, str): - fast_loc_pm = util.string_encode_locale(fast_loc_pm) for i in range(n): # Analogous to: ordinal = values[i] @@ -1433,7 +1416,6 @@ def period_array_strftime( fast_fmt, fast_loc_am, fast_loc_pm, - True, ) # Analogous to: ordinals[i] = ordinal From f9565ff72416ade61b3fafb676784ab1d2a9d9ce Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 14 May 2023 15:00:42 +0200 Subject: [PATCH 046/115] Revert "Fixed issue: there was no need to encode to bytes to apply string formatting" This reverts commit d44a4c952dafa59762d915b6a2f464e9aa485f27. --- pandas/_libs/tslibs/period.pyx | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 74652565a53fd..12b1d4e753a46 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1166,6 +1166,7 @@ cdef str period_format( object fast_fmt=None, object fast_loc_am=None, object fast_loc_pm=None, + bint preencoded_str=False ): """Important: please provide a dummy non-None fmt if fast_fmt is non-None""" @@ -1239,11 +1240,19 @@ cdef str period_format( elif fast_fmt is not None: # A custom format is requested using python string formatting + if not preencoded_str: + # Encode strings using current locale, in case they contain non-utf8 chars + if isinstance(fast_fmt, str): + fast_fmt = util.string_encode_locale(fast_fmt) + if isinstance(fast_loc_am, str): + fast_loc_am = util.string_encode_locale(fast_loc_am) + if isinstance(fast_loc_pm, str): + fast_loc_pm = util.string_encode_locale(fast_loc_pm) + # Get the quarter and fiscal year quarter = get_yq(value, freq, &dts2) - # Finally use the string template. Note: handling of non-utf8 chars is directly - # done in python here, no need to encode as for c-strftime + # Finally use the string template y = dts.year h = dts.hour return fast_fmt % { @@ -1266,7 +1275,7 @@ cdef str period_format( else: # A custom format is requested using strftime (slower) - if isinstance(fmt, str): + if not preencoded_str and isinstance(fmt, str): # Encode using current locale, in case fmt contains non-utf8 chars fmt = util.string_encode_locale(fmt) @@ -1392,6 +1401,14 @@ def period_array_strftime( except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` pass + else: + # Encode strings using current locale, in case they contain non-utf8 chars + if isinstance(fast_fmt, str): + fast_fmt = util.string_encode_locale(fast_fmt) + if isinstance(fast_loc_am, str): + fast_loc_am = util.string_encode_locale(fast_loc_am) + if isinstance(fast_loc_pm, str): + fast_loc_pm = util.string_encode_locale(fast_loc_pm) for i in range(n): # Analogous to: ordinal = values[i] @@ -1416,6 +1433,7 @@ def period_array_strftime( fast_fmt, fast_loc_am, fast_loc_pm, + True, ) # Analogous to: ordinals[i] = ordinal From 3a87c64950300b1aefa023aa048361fa9e7d4fb5 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 14 May 2023 15:01:50 +0200 Subject: [PATCH 047/115] Fixed issue: there was no need to encode to bytes to apply string formatting --- pandas/_libs/tslibs/period.pyx | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 12b1d4e753a46..74652565a53fd 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1166,7 +1166,6 @@ cdef str period_format( object fast_fmt=None, object fast_loc_am=None, object fast_loc_pm=None, - bint preencoded_str=False ): """Important: please provide a dummy non-None fmt if fast_fmt is non-None""" @@ -1240,19 +1239,11 @@ cdef str period_format( elif fast_fmt is not None: # A custom format is requested using python string formatting - if not preencoded_str: - # Encode strings using current locale, in case they contain non-utf8 chars - if isinstance(fast_fmt, str): - fast_fmt = util.string_encode_locale(fast_fmt) - if isinstance(fast_loc_am, str): - fast_loc_am = util.string_encode_locale(fast_loc_am) - if isinstance(fast_loc_pm, str): - fast_loc_pm = util.string_encode_locale(fast_loc_pm) - # Get the quarter and fiscal year quarter = get_yq(value, freq, &dts2) - # Finally use the string template + # Finally use the string template. Note: handling of non-utf8 chars is directly + # done in python here, no need to encode as for c-strftime y = dts.year h = dts.hour return fast_fmt % { @@ -1275,7 +1266,7 @@ cdef str period_format( else: # A custom format is requested using strftime (slower) - if not preencoded_str and isinstance(fmt, str): + if isinstance(fmt, str): # Encode using current locale, in case fmt contains non-utf8 chars fmt = util.string_encode_locale(fmt) @@ -1401,14 +1392,6 @@ def period_array_strftime( except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` pass - else: - # Encode strings using current locale, in case they contain non-utf8 chars - if isinstance(fast_fmt, str): - fast_fmt = util.string_encode_locale(fast_fmt) - if isinstance(fast_loc_am, str): - fast_loc_am = util.string_encode_locale(fast_loc_am) - if isinstance(fast_loc_pm, str): - fast_loc_pm = util.string_encode_locale(fast_loc_pm) for i in range(n): # Analogous to: ordinal = values[i] @@ -1433,7 +1416,6 @@ def period_array_strftime( fast_fmt, fast_loc_am, fast_loc_pm, - True, ) # Analogous to: ordinals[i] = ordinal From bc5cb66669aadcf67b3d5b79bebb5330f1e3790c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 14 May 2023 15:09:52 +0200 Subject: [PATCH 048/115] Removed todo in timedeltas.pyx --- pandas/_libs/tslibs/timedeltas.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 7b67e68557521..518a79c7c281a 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1521,7 +1521,6 @@ cdef class _Timedelta(timedelta): comp_dict = self.components._asdict() comp_dict["sign"] = sign - # TODO make marginally faster using old-style python formatting ? return fmt.format(**comp_dict) def __repr__(self) -> str: From 812aa2a8bd0f6f011d7a4ec8eb1507fc49e55537 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 14 May 2023 18:43:56 +0200 Subject: [PATCH 049/115] Fixed asv bench --- asv_bench/benchmarks/io/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 3ee9a3cdd0b5b..1606e85e5c43c 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -184,7 +184,7 @@ def setup(self, nobs): def time_frame_formatting_default(self, nobs): self.data.to_csv(self.fname) - def time_frame_date_formatting_custom(self): + def time_frame_date_formatting_custom(self, nobs): self.data.to_csv(self.fname, date_format="%Y%m%d__%H%M%S") From 69a9b6729a0c70b967a31c564f0a1e11cff7b52e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 14 May 2023 21:03:14 +0200 Subject: [PATCH 050/115] Fixed ASV --- pandas/_libs/tslibs/period.pyx | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 74652565a53fd..77cdd1957a9ea 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1162,7 +1162,8 @@ cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1: cdef str period_format( int64_t value, int freq, - object fmt=None, + str fmt=None, + bytes fmt_bytes=None, object fast_fmt=None, object fast_loc_am=None, object fast_loc_pm=None, @@ -1266,11 +1267,12 @@ cdef str period_format( else: # A custom format is requested using strftime (slower) - if isinstance(fmt, str): - # Encode using current locale, in case fmt contains non-utf8 chars - fmt = util.string_encode_locale(fmt) + if fmt_bytes is None and isinstance(fmt, str): + # If not already done, + # encode using current locale, in case fmt contains non-utf8 chars + fmt_bytes = util.string_encode_locale(fmt) - return _period_strftime(value, freq, fmt, dts) + return _period_strftime(value, freq, fmt_bytes, dts) cdef list extra_fmts = [(b"%q", b"^`AB`^"), @@ -1412,6 +1414,7 @@ def period_array_strftime( item_repr = period_format( ordinal, dtype_code, + date_format, date_fmt_bytes, fast_fmt, fast_loc_am, @@ -2534,6 +2537,7 @@ cdef class _Period(PeriodMixin): value, freq, "dummy" if fmt_str is not None else None, + None, fmt_str, loc_s.am, loc_s.pm From abaf0cba08eed689fac7ffec83e681fbd64ca596 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 14 May 2023 21:04:18 +0200 Subject: [PATCH 051/115] Fixed indentation issue --- doc/source/whatsnew/v2.1.0.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index bfa3ed006229f..c07e5aa819a76 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -93,9 +93,7 @@ Strftime performance improvements Other enhancements ^^^^^^^^^^^^^^^^^^ -- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. - :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. - Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) +- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) - :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`) - :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) - :meth:`MultiIndex.sort_values` now supports ``na_position`` (:issue:`51612`) From 537b9f73a3cd0eadb831c2156932e6e3489f0daf Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 14 May 2023 21:06:43 +0200 Subject: [PATCH 052/115] Doc fix whatsnew entries sorted --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index a627a24a79cf3..0510f03895418 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -93,8 +93,8 @@ Strftime performance improvements Other enhancements ^^^^^^^^^^^^^^^^^^ -- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) - :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`) +- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) - :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) - :meth:`MultiIndex.sort_values` now supports ``na_position`` (:issue:`51612`) - :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`) From f7307f44ab28fd1a84835b7174e7c1eed601ae30 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 15 May 2023 11:02:19 +0200 Subject: [PATCH 053/115] Improved speed of csv formatting of datetimeindex --- pandas/io/formats/csvs.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 1175d426f0d59..c8877d96c9216 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -187,10 +187,10 @@ def data_index(self) -> Index: isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and self.date_format is not None ): - # TODO This branch seems unreachable, remove the if ? - data_index = Index( - [x.strftime(self.date_format) if notna(x) else "" for x in data_index] - ) + # Format and replace missings with empty string + data_index = data_index.strftime( + date_format=self.date_format, fast_strftime=self.fast_strftime + ).fillna("") elif isinstance(data_index, ABCMultiIndex): data_index = data_index.remove_unused_levels() return data_index @@ -314,6 +314,8 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: res = df._mgr.to_native_types(**self._number_format) data = [res.iget_values(i) for i in range(len(res.items))] + # Format the index. Note that if `self.date_format` is not None the actual + # formatting is done beforehand, inside the `.data_index` property accessor. ix = self.data_index[slicer]._format_native_types(**self._number_format) libwriters.write_csv_rows( data, From f9e3b5f98114876ed9c35577c7b055a5303aef87 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 15 May 2023 11:02:43 +0200 Subject: [PATCH 054/115] pre-commit fixes --- pandas/io/formats/csvs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index c8877d96c9216..83705841ecd1e 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -26,9 +26,6 @@ ABCMultiIndex, ABCPeriodIndex, ) -from pandas.core.dtypes.missing import notna - -from pandas.core.indexes.api import Index from pandas.io.common import get_handle @@ -42,6 +39,8 @@ WriteBuffer, ) + from pandas.core.indexes.api import Index + from pandas.io.formats.format import DataFrameFormatter From 9cf6d79c0abcd1f6b6463c832fcff703b263186b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sylvain=20Mari=C3=A9?= Date: Tue, 16 May 2023 11:23:49 +0200 Subject: [PATCH 055/115] Removed invalid comment --- asv_bench/benchmarks/io/csv.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 1606e85e5c43c..1f48f72679d72 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -157,10 +157,6 @@ def time_frame_period_formatting_index_default_explicit(self, nobs, freq): self.data.to_csv(self.fname, date_format=self.default_fmt) def time_frame_period_formatting_index_custom(self, nobs, freq): - # Nb: `date_format` is not actually taken into account here today, so the - # performance is currently identical to `time_frame_period_formatting_default` - # above. This timer is therefore expected to degrade when GH#51621 is fixed. - # (Remove this comment when GH#51621 is fixed.) self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S") From 9e51d82657df849006ddd89a06abaff8bcfd13ec Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 31 May 2023 22:08:30 +0200 Subject: [PATCH 056/115] Fixed typo --- pandas/io/formats/csvs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 83705841ecd1e..716c7ff93308c 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -186,7 +186,7 @@ def data_index(self) -> Index: isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and self.date_format is not None ): - # Format and replace missings with empty string + # Format and replace missing entries with empty string data_index = data_index.strftime( date_format=self.date_format, fast_strftime=self.fast_strftime ).fillna("") From 8339062960a9e452d2c8a9e348c57a174d6bd3bf Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 1 Jun 2023 16:55:12 +0200 Subject: [PATCH 057/115] Fixed whatsnew --- doc/source/whatsnew/v2.1.0.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 714ff4b1e375b..ef546aea56c6b 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -93,8 +93,10 @@ Strftime performance improvements Other enhancements ^^^^^^^^^^^^^^^^^^ +- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. + :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. + Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) - :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`) -- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) - :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) - :meth:`MultiIndex.sort_values` now supports ``na_position`` (:issue:`51612`) - :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`) From 72df87c17081a2d221f932fc5c56c875fb038fb1 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 5 Jun 2023 15:40:26 +0200 Subject: [PATCH 058/115] Fixed changelog indentation error: now a one-liner --- doc/source/whatsnew/v2.1.0.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index ef546aea56c6b..c08666a33ee96 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -93,9 +93,7 @@ Strftime performance improvements Other enhancements ^^^^^^^^^^^^^^^^^^ -- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. - :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. - Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) +- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) - :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`) - :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) - :meth:`MultiIndex.sort_values` now supports ``na_position`` (:issue:`51612`) From ebb22e8eeed989ac56c4efc37f58e2caefcfa88f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 5 Jun 2023 15:50:11 +0200 Subject: [PATCH 059/115] Fixed mypy error --- pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/period.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ec67334977d1a..87fe54ccd4c51 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -273,7 +273,7 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]: # methods that dispatch to DatetimeArray and wrap result @doc(DatetimeArray.strftime) - def strftime(self, date_format, fast_strftime: bool = True) -> Index: + def strftime(self, date_format: str, fast_strftime: bool = True) -> Index: arr = self._data.strftime(date_format, fast_strftime=fast_strftime) return Index(arr, name=self.name, dtype=object) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f693f9557ecdc..21e13da183782 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -173,6 +173,11 @@ def _resolution_obj(self) -> Resolution: # methods that dispatch to array and wrap result in Index # These are defined here instead of via inherit_names for mypy + @doc(PeriodArray.strftime) + def strftime(self, date_format: str, fast_strftime: bool = True) -> Index: + arr = self._data.strftime(date_format, fast_strftime=fast_strftime) + return Index(arr, name=self.name, dtype=object) + @doc( PeriodArray.asfreq, other="pandas.arrays.PeriodArray", From a5869c4bf82999c6e7d06860f7347f1bc6f5ce4b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 5 Jun 2023 15:51:53 +0200 Subject: [PATCH 060/115] Fixed mypy error 2 --- pandas/core/indexes/period.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 21e13da183782..29df7a6e3ae9c 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -76,7 +76,7 @@ def _new_PeriodIndex(cls, **d): @inherit_names( - ["strftime", "start_time", "end_time"] + PeriodArray._field_ops, + ["start_time", "end_time"] + PeriodArray._field_ops, PeriodArray, wrap=True, ) From 14c9cfe80a3c9fd867c5f0a48d7c446d5fc08ff6 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 5 Jun 2023 15:58:13 +0200 Subject: [PATCH 061/115] Fixed whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 76a12cb5d2271..55a4bbecaa03d 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -93,8 +93,8 @@ Strftime performance improvements Other enhancements ^^^^^^^^^^^^^^^^^^ -- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) - :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`) +- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) - :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) - :meth:`MultiIndex.sort_values` now supports ``na_position`` (:issue:`51612`) - :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`) From e1ed22c66ad1d91fffecbd2eccce02b343c1bf72 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 6 Jun 2023 12:56:18 +0200 Subject: [PATCH 062/115] Fixed whatsnew ? --- doc/source/whatsnew/v2.1.0.rst | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 55a4bbecaa03d..f4dbe72d448b5 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -73,28 +73,20 @@ to ``na_action=None``, like for all the other array types. Strftime performance improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- New :func:`convert_strftime_format` to convert a strftime formatting template into - a python string formatting template. -- New :meth:`Period.fast_strftime` and :meth:`Timestamp.fast_strftime` leveraging - templates created with :func:`convert_strftime_format` -- New ``fast_strftime`` boolean flag in all formatting procedures to enable faster - strftime operations leveraging :func:`convert_strftime_format` and python string - formatting: - - in :meth:`DatetimeLikeArrayMixin.strftime` and - :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, - :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit - from the improvement. :class:`TimedeltaArray.strftime` and - :class:`TimedeltaArray.format` are not impacted as their `date_format` - argument is currently ignored. - - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and - :class:`CSVFormatter` +- New :func:`convert_strftime_format` to convert a strftime formatting template into a python string formatting template. +- New :meth:`Period.fast_strftime` and :meth:`Timestamp.fast_strftime` leveraging templates created with :func:`convert_strftime_format` +- New ``fast_strftime`` boolean flag in all formatting procedures to enable faster strftime operations leveraging :func:`convert_strftime_format` and python string formatting: + - in :meth:`DatetimeLikeArrayMixin.strftime` and :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their `date_format` argument is currently ignored. + - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` .. _whatsnew_210.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ +- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. + :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. + Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) - :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`) -- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter. :meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future. Also notice that :meth:`Series.map` has default ``na_action=None`` and calls to series with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`) - :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) - :meth:`MultiIndex.sort_values` now supports ``na_position`` (:issue:`51612`) - :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`) From 6a502fc4248d0570e0bf2313093cf39e403dab42 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 6 Jun 2023 13:55:15 +0200 Subject: [PATCH 063/115] Fixed whatsnew ? --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index f4dbe72d448b5..966a902c6c134 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -76,7 +76,7 @@ Strftime performance improvements - New :func:`convert_strftime_format` to convert a strftime formatting template into a python string formatting template. - New :meth:`Period.fast_strftime` and :meth:`Timestamp.fast_strftime` leveraging templates created with :func:`convert_strftime_format` - New ``fast_strftime`` boolean flag in all formatting procedures to enable faster strftime operations leveraging :func:`convert_strftime_format` and python string formatting: - - in :meth:`DatetimeLikeArrayMixin.strftime` and :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their `date_format` argument is currently ignored. + - in :meth:`DatetimeLikeArrayMixin.strftime` and :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their ``date_format`` argument is currently ignored. - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` .. _whatsnew_210.enhancements.other: From 022c185c43d303350f725c63d5c35adebaa05535 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 11 Nov 2023 15:33:14 +0100 Subject: [PATCH 064/115] Moved whatsnew items --- doc/source/whatsnew/v2.1.0.rst | 10 ---------- doc/source/whatsnew/v2.2.0.rst | 11 +++++++++++ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2b369b0ed3ef1..51b4c4f297b07 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -223,16 +223,6 @@ If the input contains NA values, the previous version would drop those as well w df.stack([0, 1], future_stack=False, dropna=False) df.stack([0, 1], future_stack=True) -.. _whatsnew_210.enhancements.enhancement3: - -Strftime performance improvements -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- New :func:`convert_strftime_format` to convert a strftime formatting template into a python string formatting template. -- New :meth:`Period.fast_strftime` and :meth:`Timestamp.fast_strftime` leveraging templates created with :func:`convert_strftime_format` -- New ``fast_strftime`` boolean flag in all formatting procedures to enable faster strftime operations leveraging :func:`convert_strftime_format` and python string formatting: - - in :meth:`DatetimeLikeArrayMixin.strftime` and :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their ``date_format`` argument is currently ignored. - - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` - .. _whatsnew_210.enhancements.other: Other enhancements diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index efa4a52993a90..921e98807fb3a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -14,6 +14,17 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_220.enhancements.strftime: + +Strftime performance improvements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- New :func:`convert_strftime_format` to convert a strftime formatting template into a python string formatting template. +- New :meth:`Period.fast_strftime` and :meth:`Timestamp.fast_strftime` leveraging templates created with :func:`convert_strftime_format` +- New ``fast_strftime`` boolean flag in all formatting procedures to enable faster strftime operations leveraging :func:`convert_strftime_format` and python string formatting: + - in :meth:`DatetimeLikeArrayMixin.strftime` and :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their ``date_format`` argument is currently ignored. + - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` + + .. _whatsnew_220.enhancements.calamine: Calamine engine for :func:`read_excel` From 21b0a88f0c54eedf2c56f4154b87ca97abe35e9c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 11 Nov 2023 15:39:31 +0100 Subject: [PATCH 065/115] Re applied mods to `TestPeriodIndexFormat`, since it had moved somewhere else --- pandas/tests/indexes/period/test_formats.py | 61 ++++++++++++++++++--- 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 81c79f7d18f2f..7b48040cddd39 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -239,31 +239,78 @@ def test_period_format_and_strftime_default(self): assert formatted[0] == "2003-01-01 12:01:01.123456789" assert formatted[1] == "2003-01-01 12:01:01.123456790" - def test_period_custom(self): + @pytest.mark.parametrize("fast_strftime", (False, True)) + def test_period_custom(self, fast_strftime): # GH#46252 custom formatting directives %l (ms) and %u (us) msg = "PeriodIndex.format is deprecated" # 3 digits - per = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="ms") + per = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="l") with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") + formatted = per.format( + date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime, + ) assert formatted[0] == "03 12:01:01 (ms=123 us=123000 ns=123000000)" assert formatted[1] == "03 12:01:01 (ms=124 us=124000 ns=124000000)" # 6 digits - per = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="us") + per = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="u") with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") + formatted = per.format( + date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime, + ) assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456000)" assert formatted[1] == "03 12:01:01 (ms=123 us=123457 ns=123457000)" # 9 digits - per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="ns") + per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") + formatted = per.format( + date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime, + ) assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456789)" assert formatted[1] == "03 12:01:01 (ms=123 us=123456 ns=123456790)" + @pytest.mark.parametrize("fast_strftime", (False, True)) + @pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], + ) + def test_period_custom_pm(self, fast_strftime, locale_str): + """Test that using %p in the custom format work well""" + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. + with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # 9 digits + p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") + formatted = p.format( + date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime, + ) + assert ( + formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456789)" + ) + assert ( + formatted[1] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456790)" + ) + # fmt: on + def test_period_tz(self): # Formatting periods created from a datetime with timezone. msg = r"PeriodIndex\.format is deprecated" From ec8036c98f1a3f55fda8c3b28e070baea26840ad Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 11 Nov 2023 15:48:42 +0100 Subject: [PATCH 066/115] Re applied mods to `TestDatetimeIndexFormat`, since it had moved somewhere else and was renamed since #55603 --- .../tests/indexes/datetimes/test_formats.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index b52eed8c509c6..9618be1cfb029 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -354,3 +354,52 @@ def test_format_date_explicit_date_format(self): formatted = dti.format(date_format="%m-%d-%Y", na_rep="UT") assert formatted[0] == "02-01-2003" assert formatted[1] == "UT" + + def test_format_datetime_tz(self): + """Test default `format()` with tz-aware datetime index.""" + # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC + dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) + # Since tz is currently set as utc, we'll see 2012 + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert dt.format()[0] == "2012-12-31 23:00:00+00:00" + # If we set tz as paris, we'll see 2013 + dt = dt.tz_convert("Europe/Paris") + with tm.assert_produces_warning(FutureWarning, match=msg): + assert dt.format()[0] == "2013-01-01 00:00:00+01:00" + + def test_format_datetime_tz_explicit(self): + """Test `format()` with tz-aware dt and a custom format string.""" + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC + dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) + + # If tz is currently set as utc, we'll see 2012 + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ( + dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] + == "2012-12-31__foo__23:00:00" + ) + # same with fancy format + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ( + dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] + == f"2012-12-31__foo__11:00:00{pm_local}" + ) + + # If tz is currently set as paris, we'll see 2013 + dt = dt.tz_convert("Europe/Paris") + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ( + dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] + == "2013-01-01__foo__00:00:00" + ) + # same with fancy format + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ( + dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] + == f"2013-01-01__foo__12:00:00{am_local}" + ) From 85c65a6995a586000c6d699e9244cff913f566ab Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 11 Nov 2023 15:54:14 +0100 Subject: [PATCH 067/115] Re applied mods from `TestFastStrfTimeScalars` in the right places --- pandas/tests/scalar/period/test_period.py | 37 ++++++++++++++++++ pandas/tests/scalar/timestamp/test_formats.py | 39 +++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index add7867611303..b7eec958c2f4c 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -757,6 +757,43 @@ def test_strftime(self): assert res == "2000-01-01 12:34:12" assert isinstance(res, str) + @pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], + ) + def test_strftime_locale(self, locale_str): + """ + Test that `convert_strftime_format` and `fast_strftime` + work well together and rely on runtime locale + """ + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. + with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # Use the function + str_tmp, loc_s = convert_strftime_format("%p", target="period") + assert str_tmp == "%(ampm)s" + + # Period + am_per = pd.Period("2018-03-11 01:00", freq="H") + assert am_local == am_per.strftime("%p") + assert am_local == am_per.fast_strftime(str_tmp, loc_s) + pm_per = pd.Period("2018-03-11 13:00", freq="H") + assert pm_local == pm_per.strftime("%p") + assert pm_local == pm_per.fast_strftime(str_tmp, loc_s) + class TestPeriodProperties: """Test properties such as year, month, weekday, etc....""" diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index d7160597ea6d6..0d9e21f02b364 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -199,3 +199,42 @@ def test_repr_matches_pydatetime_tz_dateutil(self): dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=utc) assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) + + @pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], + ) + def test_strftime_locale(self, locale_str): + """ + Test that `convert_strftime_format` and `fast_strftime` + work well together and rely on runtime locale + """ + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. + with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # Use the function + str_tmp, loc_s = convert_strftime_format("%p", target="datetime") + assert str_tmp == "%(ampm)s" + + # Now what about the classes ? + # Timestamp + am_ts = Timestamp(2020, 1, 1, 1) + assert am_local == am_ts.strftime("%p") + assert am_local == am_ts.fast_strftime(str_tmp, loc_s) + pm_ts = Timestamp(2020, 1, 1, 13) + assert pm_local == pm_ts.strftime("%p") + assert pm_local == pm_ts.fast_strftime(str_tmp, loc_s) + From bac97f940af2869a50ab9ff227a1f5a81639bdae Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Nov 2023 14:49:04 +0100 Subject: [PATCH 068/115] Ruff: Fixed invalid character in comment --- pandas/_libs/tslibs/strftime.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 0db0d1b849029..ef5b766d152ce 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -12,11 +12,11 @@ class UnsupportedStrFmtDirective(ValueError): _COMMON_UNSUPPORTED = ( # 1- Names not in the numpy or datetime attr representation - "%a", # Weekday as locale’s abbreviated name. - "%A", # Weekday as locale’s full name. + "%a", # Weekday as locale's abbreviated name. + "%A", # Weekday as locale's full name. "%w", # Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. - "%b", # Month as locale’s abbreviated name. - "%B", # Month as locale’s full name. + "%b", # Month as locale's abbreviated name. + "%B", # Month as locale's full name. # 2- TODO Below Time offset and timezone information ... but may be hard "%z", # UTC offset in the form ±HHMM[SS[.ffffff]] ("" if tz naive). "%Z", # Time zone name ("" if tz naive). @@ -28,9 +28,9 @@ class UnsupportedStrFmtDirective(ValueError): "%W", # Week number of the year (Monday as the first day of the week) as # a zero-padded decimal number. All days in a new year preceding the first # Monday are considered to be in week 0. - "%c", # Locale’s appropriate date and time representation. - "%x", # Locale’s appropriate date representation. - "%X", # Locale’s appropriate time representation. + "%c", # Locale's appropriate date and time representation. + "%x", # Locale's appropriate date representation. + "%X", # Locale's appropriate time representation. ) @@ -41,7 +41,7 @@ class UnsupportedStrFmtDirective(ValueError): "%y": ("shortyear", "02d"), # Year without century as 0-padded decimal nb. "%H": ("hour", "02d"), # Hour (24-hour clock) as 0-padded decimal number. "%I": ("hour12", "02d"), # Hour (12-hour clock) as a 0-padded decimal nb. - "%p": ("ampm", "s"), # Locale’s equivalent of either AM or PM. + "%p": ("ampm", "s"), # Locale's equivalent of either AM or PM. "%M": ("min", "02d"), # Minute as a zero-padded decimal number. "%S": ("sec", "02d"), # Second as a zero-padded decimal number. } @@ -83,9 +83,9 @@ class LocaleSpecificDtStrings: Attributes ---------- am : str - Used in the %p strftime directive. Locale’s equivalent of AM. + Used in the %p strftime directive. Locale's equivalent of AM. pm : str - Used in the %p strftime directive. Locale’s equivalent of PM. + Used in the %p strftime directive. Locale's equivalent of PM. """ __slots__ = ("am", "pm") From 324052b9bbdb5127c45a2ec6fb0c7e35300df1b9 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Nov 2023 14:49:42 +0100 Subject: [PATCH 069/115] black+isort+fixed tests --- pandas/core/indexes/datetimelike.py | 8 ++++--- pandas/io/formats/format.py | 1 - .../tests/indexes/datetimes/test_formats.py | 12 ++++++++++- pandas/tests/io/formats/test_format.py | 2 -- pandas/tests/scalar/period/test_period.py | 15 +++++++++++-- pandas/tests/scalar/timestamp/test_formats.py | 21 ++++++++++++++++--- 6 files changed, 47 insertions(+), 12 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7f39ed30da9d9..ad5405fa2e74e 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -222,15 +222,17 @@ def format( return header + list(self.map(formatter)) return self._format_with_header( - header=header, na_rep=na_rep, date_format=date_format - , fast_strftime=fast_strftime + header=header, + na_rep=na_rep, + date_format=date_format, + fast_strftime=fast_strftime, ) def _format_with_header( self, *, header: list[str], - na_rep: str , + na_rep: str, date_format: str | None = None, fast_strftime: bool = True, ) -> list[str]: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 8ccb47a1aeaa8..53f0e9051d6c9 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1697,7 +1697,6 @@ class _Datetime64TZFormatter(_Datetime64Formatter): def _format_strings(self) -> list[str]: """we by definition have a TZ""" - ido = self.values._is_dates_only values = self.values.astype(object) formatter = self.formatter or get_format_datetime64( is_dates_only_=False, diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 9618be1cfb029..a62d43a3394a0 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -1,4 +1,7 @@ -from datetime import datetime +from datetime import ( + datetime, + time, +) import dateutil.tz import numpy as np @@ -293,6 +296,13 @@ def test_dti_business_repr_etc_smoke(self, tz, freq): dti[2:2]._summary() +def get_local_am_pm(): + """Return the AM and PM strings returned by strftime in current locale.""" + am_local = time(1).strftime("%p") + pm_local = time(13).strftime("%p") + return am_local, pm_local + + class TestFormat: def test_format(self): # GH#35439 diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 4842d31c3241d..b520e2463353c 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._libs.tslibs import convert_strftime_format - import pandas as pd from pandas import ( DataFrame, diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index b7eec958c2f4c..bd0a468ef104a 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -1,8 +1,11 @@ +from contextlib import nullcontext from datetime import ( date, datetime, + time, timedelta, ) +import locale import numpy as np import pytest @@ -28,6 +31,7 @@ Period, Timedelta, Timestamp, + convert_strftime_format, offsets, ) import pandas._testing as tm @@ -586,6 +590,13 @@ def test_period_large_ordinal(self, hour): assert p.hour == hour +def get_local_am_pm(): + """Return the AM and PM strings returned by strftime in current locale.""" + am_local = time(1).strftime("%p") + pm_local = time(13).strftime("%p") + return am_local, pm_local + + class TestPeriodMethods: def test_round_trip(self): p = Period("2000Q1") @@ -787,10 +798,10 @@ def test_strftime_locale(self, locale_str): assert str_tmp == "%(ampm)s" # Period - am_per = pd.Period("2018-03-11 01:00", freq="H") + am_per = Period("2018-03-11 01:00", freq="h") assert am_local == am_per.strftime("%p") assert am_local == am_per.fast_strftime(str_tmp, loc_s) - pm_per = pd.Period("2018-03-11 13:00", freq="H") + pm_per = Period("2018-03-11 13:00", freq="h") assert pm_local == pm_per.strftime("%p") assert pm_local == pm_per.fast_strftime(str_tmp, loc_s) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 0d9e21f02b364..fedc245206cac 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -1,11 +1,20 @@ -from datetime import datetime +from contextlib import nullcontext +from datetime import ( + datetime, + time, +) +import locale import pprint import dateutil.tz import pytest import pytz # a test below uses pytz but only inside a `eval` call -from pandas import Timestamp +from pandas import ( + Timestamp, + convert_strftime_format, +) +import pandas._testing as tm ts_no_ns = Timestamp( year=2019, @@ -87,6 +96,13 @@ def test_isoformat(ts, timespec, expected_iso): assert ts.isoformat(timespec=timespec) == expected_iso +def get_local_am_pm(): + """Return the AM and PM strings returned by strftime in current locale.""" + am_local = time(1).strftime("%p") + pm_local = time(13).strftime("%p") + return am_local, pm_local + + class TestTimestampRendering: timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] @@ -237,4 +253,3 @@ def test_strftime_locale(self, locale_str): pm_ts = Timestamp(2020, 1, 1, 13) assert pm_local == pm_ts.strftime("%p") assert pm_local == pm_ts.fast_strftime(str_tmp, loc_s) - From 20e38ef6ca8546e4d0931e2520c442e551f15624 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Nov 2023 15:05:56 +0100 Subject: [PATCH 070/115] Implemented conservative fallback as suggested per code review --- pandas/_libs/tslibs/strftime.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index ef5b766d152ce..654fc2aca8d95 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -246,6 +246,10 @@ def convert_strftime_format( key, f"{esc_l}{_name}:{_fmt}{esc_r}" ) + # If there are remaining percent signs, be conservative and fallback + if "%" in strftime_fmt: + raise UnsupportedStrFmtDirective(f"Unsupported directive found") + # Restore the %% into % strftime_fmt = strftime_fmt.replace(esc, "%") @@ -267,8 +271,9 @@ def convert_strftime_format( # for example replace "%d" by "%(day)02d" but with escaped % strftime_fmt = strftime_fmt.replace(key, f"{esc}({_name}){_fmt}") - # Escape remaining percent signs - strftime_fmt = strftime_fmt.replace("%", "%%") + # If there are remaining percent signs, be conservative and fallback + if "%" in strftime_fmt: + raise UnsupportedStrFmtDirective(f"Unsupported directive found") # Finally replace our placeholder strftime_fmt = strftime_fmt.replace(esc, "%") From 9602eb5368c9531f485862bca8c7b8bdea118297 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Nov 2023 17:10:27 +0100 Subject: [PATCH 071/115] black --- pandas/_libs/tslibs/strftime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 654fc2aca8d95..12b145404d420 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -248,7 +248,7 @@ def convert_strftime_format( # If there are remaining percent signs, be conservative and fallback if "%" in strftime_fmt: - raise UnsupportedStrFmtDirective(f"Unsupported directive found") + raise UnsupportedStrFmtDirective("Unsupported directive found") # Restore the %% into % strftime_fmt = strftime_fmt.replace(esc, "%") @@ -273,7 +273,7 @@ def convert_strftime_format( # If there are remaining percent signs, be conservative and fallback if "%" in strftime_fmt: - raise UnsupportedStrFmtDirective(f"Unsupported directive found") + raise UnsupportedStrFmtDirective("Unsupported directive found") # Finally replace our placeholder strftime_fmt = strftime_fmt.replace(esc, "%") From 9223451cd89c2c2f37a386fc4c514c7c2962da04 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Nov 2023 17:19:29 +0100 Subject: [PATCH 072/115] Implemented hypothesis tests as suggested per code review --- pandas/tests/arrays/test_datetimelike.py | 109 +++++++++++++++++++++++ pandas/tests/tslibs/test_strftime.py | 20 +++-- 2 files changed, 122 insertions(+), 7 deletions(-) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 5a5abcc4aa85d..ab5b8ee249bda 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1,8 +1,12 @@ from __future__ import annotations +import datetime as dt import re import warnings +import hypothesis.strategies as st +from hypothesis import given + import numpy as np import pytest @@ -599,6 +603,74 @@ def test_from_integer_array(self): tm.assert_extension_array_equal(result, expected) +def _is_supported_directive(d: str) -> bool: + """ + Return True if strftime directive 'd' is supported on current platform + See https://strftime.org/ and https://stackoverflow.com/a/2073189/7262247 + """ + try: + dt.datetime(1700, 1, 1).strftime(d) + except ValueError: + return False + else: + return True + + +DT_STRFTIME_DIRECTIVES = [ + d + for d in [ + "%a", + "%A", + "%w", + "%d", + "%-d", + "%#d", + "%b", + "%B", + "%m", + "%-m", + "%#m", + "%y", + "%Y", + "%H", + "%-H", + "%#H", + "%I", + "%-I", + "%#I", + "%p", + "%M", + "%-M", + "%#M", + "%S", + "%-S", + "%#S", + "%f", + "%z", + "%Z", + "%j", + "%-j", + "%#j", + "%U", + "%-U", + "%#U", + "%W", + "%-W", + "%#W", + "%c", + "%x", + "%X", + "%%", + ] + if _is_supported_directive(d) +] +PERIOD_STRFTIME_DIRECTIVES = [ + d for d in DT_STRFTIME_DIRECTIVES if d not in ("%X", "%f", "%z", "%Z") +] +"""Note that even though periods are not timezone-aware (GH#45736), %z and %Z return +unexpected non empty result, hence their exclusion from this testing list.""" + + class TestDatetimeArray(SharedTests): index_cls = DatetimeIndex array_cls = DatetimeArray @@ -886,6 +958,25 @@ def test_strftime_nat(self): expected = np.array(["2019-01-01", np.nan], dtype=object) tm.assert_numpy_array_equal(result, expected) + @given( + datetimes=st.lists( + st.datetimes( + min_value=dt.datetime(1700, 1, 1), max_value=dt.datetime(2200, 1, 1) + ) + ), + fmt=st.sets(st.sampled_from(DT_STRFTIME_DIRECTIVES), min_size=1), + ) + @pytest.mark.parametrize("tz_aware", (False, True)) + def test_strftime_hypo(self, datetimes, fmt, tz_aware): + """Test that idx.strftime's content is equivalent to datetime.strftime""" + fmt = "".join(fmt) + if tz_aware: + datetimes = [_dt.replace(tzinfo=dt.timezone.utc) for _dt in datetimes] + idx = pd.DatetimeIndex(datetimes) + result = idx.strftime(fmt) + expected = pd.Index([i.strftime(fmt) for i in datetimes]) + pd.testing.assert_index_equal(result, expected) + class TestTimedeltaArray(SharedTests): index_cls = TimedeltaIndex @@ -1153,6 +1244,24 @@ def test_strftime_nat(self): expected = np.array(["2019-01-01", np.nan], dtype=object) tm.assert_numpy_array_equal(result, expected) + @given( + datetimes=st.lists( + st.datetimes( + min_value=dt.datetime(1700, 1, 1), max_value=dt.datetime(2200, 1, 1) + ) + ), + fmt=st.sets(st.sampled_from(PERIOD_STRFTIME_DIRECTIVES), min_size=1), + ) + def test_strftime_hypo(self, datetimes, fmt): + """Test that idx.strftime's content is equivalent to datetime.strftime + Note that periods are not timezone-aware see GH#45736 + """ + fmt = "".join(fmt) + idx = pd.PeriodIndex(datetimes, freq="s") + result = idx.strftime(fmt) + expected = pd.Index([i.strftime(fmt) for i in datetimes]) + pd.testing.assert_index_equal(result, expected) + @pytest.mark.parametrize( "arr,casting_nats", diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py index a075fb6f67f1b..82a46a785836f 100644 --- a/pandas/tests/tslibs/test_strftime.py +++ b/pandas/tests/tslibs/test_strftime.py @@ -160,12 +160,18 @@ def test_invalid_datetime_directive(self): with pytest.raises(UnsupportedStrFmtDirective, match="Unsupported directive"): convert_strftime_format("%F", target="datetime") + # Make sure that the same directive is valid for periods + assert convert_strftime_format("%F", target="period")[0] == "%(Fyear)d" + + def test_invalid_period_directive(self): + """Test that using invalid strftime directives for period raises an error""" + with pytest.raises(UnsupportedStrFmtDirective, match="Unsupported directive"): + convert_strftime_format("%j", target="period") + def test_unknown_directive(self): - """Test that unknown directives (non strftime) are simply escaped.""" - res_str, _ = convert_strftime_format("%O", target="datetime") - assert res_str == "%%O" + """Test that unknown/not available strftime directives lead to an error.""" + with pytest.raises(ValueError, match="Unsupported directive"): + convert_strftime_format("%O", target="datetime") - res_str, _ = convert_strftime_format( - "%O", target="datetime", new_style_fmt=True - ) - assert res_str == "%O" + with pytest.raises(ValueError, match="Unsupported directive"): + convert_strftime_format("%O", target="datetime", new_style_fmt=True) From fb0e30c947a2c4959253eb2b3c5ac44efe7730bd Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Nov 2023 17:43:34 +0100 Subject: [PATCH 073/115] Removed the `fast_strftime` argument everywhere as suggested per code review --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/_libs/tslib.pyi | 1 - pandas/_libs/tslibs/period.pyi | 2 -- pandas/_libs/tslibs/timestamps.pyi | 1 - pandas/core/arrays/datetimelike.py | 20 +++------------ pandas/core/arrays/datetimes.py | 9 ++----- pandas/core/arrays/period.py | 9 ++----- pandas/core/arrays/timedeltas.py | 7 +----- pandas/core/generic.py | 10 -------- pandas/core/indexes/base.py | 1 - pandas/core/indexes/datetimelike.py | 17 +++---------- pandas/core/indexes/datetimes.py | 4 +-- pandas/core/indexes/period.py | 4 +-- pandas/io/formats/csvs.py | 7 +----- pandas/io/formats/format.py | 16 +++--------- pandas/tests/arrays/test_datetimelike.py | 11 ++++---- pandas/tests/frame/methods/test_to_csv.py | 22 ++++++---------- pandas/tests/indexes/period/test_formats.py | 28 ++++++--------------- 18 files changed, 40 insertions(+), 131 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8bc8eaedd5189..eef7adff1f6d2 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -20,7 +20,7 @@ Strftime performance improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - New :func:`convert_strftime_format` to convert a strftime formatting template into a python string formatting template. - New :meth:`Period.fast_strftime` and :meth:`Timestamp.fast_strftime` leveraging templates created with :func:`convert_strftime_format` -- New ``fast_strftime`` boolean flag in all formatting procedures to enable faster strftime operations leveraging :func:`convert_strftime_format` and python string formatting: +- All formatting procedures support faster strftime operations leveraging :func:`convert_strftime_format` and python string formatting: - in :meth:`DatetimeLikeArrayMixin.strftime` and :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their ``date_format`` argument is currently ignored. - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 97b3dfc10486a..5a340c1d88bc4 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -10,7 +10,6 @@ def format_array_from_datetime( format: str | None = ..., na_rep: str | float = ..., reso: int = ..., # NPY_DATETIMEUNIT - fast_strftime: bool = ..., ) -> npt.NDArray[np.object_]: ... def array_with_unit_to_datetime( values: npt.NDArray[np.object_], diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index b98a1b7151988..a4aecd2ce0a09 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -48,7 +48,6 @@ def period_array_strftime( dtype_code: int, na_rep, date_format: str | None, - fast_strftime: bool, ) -> npt.NDArray[np.object_]: ... # exposed for tests @@ -91,7 +90,6 @@ class Period(PeriodMixin): def _from_ordinal(cls, ordinal: int, freq) -> Period: ... @classmethod def now(cls, freq: Frequency = ...) -> Period: ... - def fast_strftime(self, fmt_str: str, loc_s: object) -> str: ... def strftime(self, fmt: str | None) -> str: ... def to_timestamp( self, diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 1492a4e479a8c..e23f01b800874 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -109,7 +109,6 @@ class Timestamp(datetime): ) -> datetime: ... @classmethod def fromisoformat(cls, date_string: str) -> Self: ... - def fast_strftime(self, fmt_str: str, loc_s: object) -> str: ... def strftime(self, format: str) -> str: ... def __format__(self, fmt: str) -> str: ... def toordinal(self) -> int: ... diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index db4d878680688..33b2f65340a3b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -332,11 +332,7 @@ def asi8(self) -> npt.NDArray[np.int64]: # Rendering Methods def _format_native_types( - self, - *, - na_rep: str | float = "NaT", - date_format=None, - fast_strftime: bool = True, + self, *, na_rep: str | float = "NaT", date_format=None ) -> npt.NDArray[np.object_]: """ Helper method for astype when converting to strings. @@ -1719,9 +1715,7 @@ class DatelikeOps(DatetimeLikeArrayMixin): URL="https://docs.python.org/3/library/datetime.html" "#strftime-and-strptime-behavior" ) - def strftime( - self, date_format: str, fast_strftime: bool = True - ) -> npt.NDArray[np.object_]: + def strftime(self, date_format: str) -> npt.NDArray[np.object_]: """ Convert to Index using specified date_format. @@ -1743,12 +1737,6 @@ def strftime( date_format : str Date format string (e.g. "%%Y-%%m-%%d"). - fast_strftime : bool, default True - If `True` (default) and the format permits it, a faster formatting - method will be used. See `convert_strftime_format`. - - .. versionadded:: 1.5.4 - Returns ------- ndarray[object] @@ -1772,9 +1760,7 @@ def strftime( 'March 10, 2018, 09:00:02 AM'], dtype='object') """ - result = self._format_native_types( - date_format=date_format, na_rep=np.nan, fast_strftime=fast_strftime - ) + result = self._format_native_types(date_format=date_format, na_rep=np.nan) return result.astype(object, copy=False) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index f2f98c7591603..ac56df210eb75 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -749,12 +749,7 @@ def astype(self, dtype, copy: bool = True): # Rendering Methods def _format_native_types( - self, - *, - na_rep: str | float = "NaT", - date_format=None, - fast_strftime: bool = True, - **kwargs, + self, *, na_rep: str | float = "NaT", date_format=None, **kwargs ) -> npt.NDArray[np.object_]: if date_format is None and self._is_dates_only: # Only dates and no timezone: provide a default format @@ -766,7 +761,7 @@ def _format_native_types( format=date_format, na_rep=na_rep, reso=self._creso, - fast_strftime=fast_strftime, + fast_strftime=True, ) # ----------------------------------------------------------------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index bb84919ad57b1..179f0ba0bb207 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -757,18 +757,13 @@ def _formatter(self, boxed: bool = False): return "'{}'".format def _format_native_types( - self, - *, - na_rep: str | float = "NaT", - date_format=None, - fast_strftime: bool = True, - **kwargs, + self, *, na_rep: str | float = "NaT", date_format=None, **kwargs ) -> npt.NDArray[np.object_]: """ actually format my specific types """ return libperiod.period_array_strftime( - self.asi8, self.dtype._dtype_code, na_rep, date_format, fast_strftime + self.asi8, self.dtype._dtype_code, na_rep, date_format, fast_strftime=True ) # ------------------------------------------------------------------ diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index f8ed97670867c..1f8015075c55b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -469,12 +469,7 @@ def _formatter(self, boxed: bool = False): return get_format_timedelta64(self, box=True) def _format_native_types( - self, - *, - na_rep: str | float = "NaT", - date_format=None, - fast_strftime: bool = True, - **kwargs, + self, *, na_rep: str | float = "NaT", date_format=None, **kwargs ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_timedelta64 diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 215bacd41f2e4..7918e43b48719 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3717,7 +3717,6 @@ def to_csv( lineterminator: str | None = ..., chunksize: int | None = ..., date_format: str | None = ..., - fast_strftime: bool_t = ..., doublequote: bool_t = ..., escapechar: str | None = ..., decimal: str = ..., @@ -3745,7 +3744,6 @@ def to_csv( lineterminator: str | None = ..., chunksize: int | None = ..., date_format: str | None = ..., - fast_strftime: bool_t = ..., doublequote: bool_t = ..., escapechar: str | None = ..., decimal: str = ..., @@ -3780,7 +3778,6 @@ def to_csv( lineterminator: str | None = None, chunksize: int | None = None, date_format: str | None = None, - fast_strftime: bool_t = True, doublequote: bool_t = True, escapechar: str | None = None, decimal: str = ".", @@ -3874,12 +3871,6 @@ def to_csv( Rows to write at a time. date_format : str, default None Format string for datetime objects. - fast_strftime : bool, default True - If `True` (default) and the format permits it, a faster formatting - method will be used. See `convert_strftime_format`. - - .. versionadded:: 1.5.4 - doublequote : bool, default True Control quoting of `quotechar` inside a field. escapechar : str, default None @@ -3960,7 +3951,6 @@ def to_csv( chunksize=chunksize, quotechar=quotechar, date_format=date_format, - fast_strftime=fast_strftime, doublequote=doublequote, escapechar=escapechar, storage_options=storage_options, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9a5de922edce3..1b4e14f075f22 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1446,7 +1446,6 @@ def _get_values_for_csv( decimal: str_t = ".", float_format=None, date_format=None, - fast_strftime: bool = True, quoting=None, ) -> npt.NDArray[np.object_]: return get_values_for_csv( diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index ad5405fa2e74e..a3e6c50b21642 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -197,7 +197,6 @@ def format( formatter: Callable | None = None, na_rep: str = "NaT", date_format: str | None = None, - fast_strftime: bool = True, ) -> list[str]: """ Render a string representation of the Index. @@ -222,26 +221,16 @@ def format( return header + list(self.map(formatter)) return self._format_with_header( - header=header, - na_rep=na_rep, - date_format=date_format, - fast_strftime=fast_strftime, + header=header, na_rep=na_rep, date_format=date_format ) def _format_with_header( - self, - *, - header: list[str], - na_rep: str, - date_format: str | None = None, - fast_strftime: bool = True, + self, *, header: list[str], na_rep: str, date_format: str | None = None ) -> list[str]: # TODO: not reached in tests 2023-10-11 # matches base class except for whitespace padding and date_format return header + list( - self._get_values_for_csv( - na_rep=na_rep, date_format=date_format, fast_strftime=fast_strftime - ) + self._get_values_for_csv(na_rep=na_rep, date_format=date_format) ) @property diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 4635110340980..73143730085d6 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -272,8 +272,8 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]: # methods that dispatch to DatetimeArray and wrap result @doc(DatetimeArray.strftime) - def strftime(self, date_format: str, fast_strftime: bool = True) -> Index: - arr = self._data.strftime(date_format, fast_strftime=fast_strftime) + def strftime(self, date_format) -> Index: + arr = self._data.strftime(date_format) return Index(arr, name=self.name, dtype=object) @doc(DatetimeArray.tz_convert) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 9ee3b01b44b94..fc88b368507ed 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -199,8 +199,8 @@ def _resolution_obj(self) -> Resolution: # These are defined here instead of via inherit_names for mypy @doc(PeriodArray.strftime) - def strftime(self, date_format: str, fast_strftime: bool = True) -> Index: - arr = self._data.strftime(date_format, fast_strftime=fast_strftime) + def strftime(self, date_format: str) -> Index: + arr = self._data.strftime(date_format) return Index(arr, name=self.name, dtype=object) @doc( diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index b0dfa87b18fc2..6e695c174ad4c 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -71,7 +71,6 @@ def __init__( chunksize: int | None = None, quotechar: str | None = '"', date_format: str | None = None, - fast_strftime: bool = True, doublequote: bool = True, escapechar: str | None = None, storage_options: StorageOptions | None = None, @@ -95,7 +94,6 @@ def __init__( self.escapechar = escapechar self.lineterminator = lineterminator or os.linesep self.date_format = date_format - self.fast_strftime = fast_strftime self.cols = self._initialize_columns(cols) self.chunksize = self._initialize_chunksize(chunksize) @@ -184,7 +182,6 @@ def _number_format(self) -> dict[str, Any]: "na_rep": self.na_rep, "float_format": self.float_format, "date_format": self.date_format, - "fast_strftime": self.fast_strftime, "quoting": self.quoting, "decimal": self.decimal, } @@ -197,9 +194,7 @@ def data_index(self) -> Index: and self.date_format is not None ): # Format and replace missing entries with empty string - data_index = data_index.strftime( - date_format=self.date_format, fast_strftime=self.fast_strftime - ).fillna("") + data_index = data_index.strftime(date_format=self.date_format).fillna("") elif isinstance(data_index, ABCMultiIndex): data_index = data_index.remove_unused_levels() return data_index diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 53f0e9051d6c9..b3632eb7bee4e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -981,7 +981,6 @@ def to_csv( lineterminator: str | None = None, chunksize: int | None = None, date_format: str | None = None, - fast_strftime: bool = True, doublequote: bool = True, escapechar: str | None = None, errors: str = "strict", @@ -1012,7 +1011,6 @@ def to_csv( chunksize=chunksize, quotechar=quotechar, date_format=date_format, - fast_strftime=fast_strftime, doublequote=doublequote, escapechar=escapechar, storage_options=storage_options, @@ -1498,13 +1496,11 @@ def __init__( values: DatetimeArray, nat_rep: str = "NaT", date_format: None = None, - fast_strftime: bool = True, **kwargs, ) -> None: super().__init__(values, **kwargs) self.nat_rep = nat_rep self.date_format = date_format - self.fast_strftime = fast_strftime def _format_strings(self) -> list[str]: """we by definition have DO NOT have a TZ""" @@ -1514,9 +1510,7 @@ def _format_strings(self) -> list[str]: return [self.formatter(x) for x in values] fmt_values = values._format_native_types( - na_rep=self.nat_rep, - date_format=self.date_format, - fast_strftime=self.fast_strftime, + na_rep=self.nat_rep, date_format=self.date_format ) return fmt_values.tolist() @@ -1660,17 +1654,14 @@ def _format_datetime64_dateonly( def get_format_datetime64( - is_dates_only: bool, - nat_rep: str = "NaT", - date_format: str | None = None, - fast_strftime: bool = True, + is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None ) -> Callable: """Return a formatter callable taking a datetime64 as input and providing a string as output""" if is_dates_only: str_date_fmt = loc_s = None - if date_format is not None and fast_strftime: + if date_format is not None: try: # Try to get the string formatting template for this format str_date_fmt, loc_s = convert_strftime_format( @@ -1701,7 +1692,6 @@ def _format_strings(self) -> list[str]: formatter = self.formatter or get_format_datetime64( is_dates_only_=False, date_format=self.date_format, - fast_strftime=self.fast_strftime, ) fmt_values = [formatter(x) for x in values] diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index dedeb214c94aa..a4a84318b8360 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -4,9 +4,8 @@ import re import warnings -import hypothesis.strategies as st from hypothesis import given - +import hypothesis.strategies as st import numpy as np import pytest @@ -667,7 +666,7 @@ def _is_supported_directive(d: str) -> bool: PERIOD_STRFTIME_DIRECTIVES = [ d for d in DT_STRFTIME_DIRECTIVES if d not in ("%X", "%f", "%z", "%Z") ] -"""Note that even though periods are not timezone-aware (GH#45736), %z and %Z return +"""Note that even though periods are not timezone-aware (GH#45736), %z and %Z return unexpected non empty result, hence their exclusion from this testing list.""" @@ -981,7 +980,7 @@ def test_strftime_hypo(self, datetimes, fmt, tz_aware): idx = pd.DatetimeIndex(datetimes) result = idx.strftime(fmt) expected = pd.Index([i.strftime(fmt) for i in datetimes]) - pd.testing.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) class TestTimedeltaArray(SharedTests): @@ -1263,10 +1262,10 @@ def test_strftime_hypo(self, datetimes, fmt): Note that periods are not timezone-aware see GH#45736 """ fmt = "".join(fmt) - idx = pd.PeriodIndex(datetimes, freq="s") + idx = PeriodIndex(datetimes, freq="s") result = idx.strftime(fmt) expected = pd.Index([i.strftime(fmt) for i in datetimes]) - pd.testing.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index df040ed0191c1..94c98ad477cc1 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1013,16 +1013,13 @@ def test_to_csv_compression(self, df, encoding, compression): with tm.decompress_file(filename, compression) as fh: tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) - @pytest.mark.parametrize("fast_strftime", (True, False)) - def test_to_csv_date_format(self, datetime_frame, fast_strftime): + def test_to_csv_date_format(self, datetime_frame): with tm.ensure_clean("__tmp_to_csv_date_format__") as path: dt_index = datetime_frame.index datetime_frame = DataFrame( {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index ) - datetime_frame.to_csv( - path, date_format="%Y%m%d", fast_strftime=fast_strftime - ) + datetime_frame.to_csv(path, date_format="%Y%m%d") # Check that the data was put in the specified format test = read_csv(path, index_col=0) @@ -1034,9 +1031,7 @@ def test_to_csv_date_format(self, datetime_frame, fast_strftime): tm.assert_frame_equal(test, datetime_frame_int) - datetime_frame.to_csv( - path, date_format="%Y-%m-%d", fast_strftime=fast_strftime - ) + datetime_frame.to_csv(path, date_format="%Y-%m-%d") # Check that the data was put in the specified format test = read_csv(path, index_col=0) @@ -1049,9 +1044,7 @@ def test_to_csv_date_format(self, datetime_frame, fast_strftime): # Check that columns get converted datetime_frame_columns = datetime_frame.T - datetime_frame_columns.to_csv( - path, date_format="%Y%m%d", fast_strftime=fast_strftime - ) + datetime_frame_columns.to_csv(path, date_format="%Y%m%d") test = read_csv(path, index_col=0) @@ -1070,15 +1063,14 @@ def test_to_csv_date_format(self, datetime_frame, fast_strftime): ["NaT"] * 10 + ["2000-01-01", "2000-01-01", "2000-01-01"] ) nat_frame = DataFrame({"A": nat_index}, index=nat_index) - nat_frame.to_csv(path, date_format="%Y-%m-%d", fast_strftime=fast_strftime) + nat_frame.to_csv(path, date_format="%Y-%m-%d") test = read_csv(path, parse_dates=[0, 1], index_col=0) tm.assert_frame_equal(test, nat_frame) - @pytest.mark.parametrize("fast_strftime", (True, False)) @pytest.mark.parametrize("td", [pd.Timedelta(0), pd.Timedelta("10s")]) - def test_to_csv_with_dst_transitions(self, td, fast_strftime): + def test_to_csv_with_dst_transitions(self, td): with tm.ensure_clean("csv_date_format_with_dst") as path: # make sure we are not failing on transitions times = date_range( @@ -1092,7 +1084,7 @@ def test_to_csv_with_dst_transitions(self, td, fast_strftime): i = i._with_freq(None) # freq is not preserved by read_csv time_range = np.array(range(len(i)), dtype="int64") df = DataFrame({"A": time_range}, index=i) - df.to_csv(path, index=True, fast_strftime=fast_strftime) + df.to_csv(path, index=True) # we have to reconvert the index as we # don't parse the tz's result = read_csv(path, index_col=0) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 7b48040cddd39..796ce6a1279eb 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -239,42 +239,31 @@ def test_period_format_and_strftime_default(self): assert formatted[0] == "2003-01-01 12:01:01.123456789" assert formatted[1] == "2003-01-01 12:01:01.123456790" - @pytest.mark.parametrize("fast_strftime", (False, True)) - def test_period_custom(self, fast_strftime): + def test_period_custom(self): # GH#46252 custom formatting directives %l (ms) and %u (us) msg = "PeriodIndex.format is deprecated" # 3 digits - per = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="l") + per = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="ms") with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format( - date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)", - fast_strftime=fast_strftime, - ) + formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") assert formatted[0] == "03 12:01:01 (ms=123 us=123000 ns=123000000)" assert formatted[1] == "03 12:01:01 (ms=124 us=124000 ns=124000000)" # 6 digits - per = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="u") + per = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="us") with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format( - date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)", - fast_strftime=fast_strftime, - ) + formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456000)" assert formatted[1] == "03 12:01:01 (ms=123 us=123457 ns=123457000)" # 9 digits - per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") + per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="ns") with tm.assert_produces_warning(FutureWarning, match=msg): - formatted = per.format( - date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)", - fast_strftime=fast_strftime, - ) + formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456789)" assert formatted[1] == "03 12:01:01 (ms=123 us=123456 ns=123456790)" - @pytest.mark.parametrize("fast_strftime", (False, True)) @pytest.mark.parametrize( "locale_str", [ @@ -285,7 +274,7 @@ def test_period_custom(self, fast_strftime): "zh_CN", # Note: encoding will be 'gb2312' ], ) - def test_period_custom_pm(self, fast_strftime, locale_str): + def test_period_custom_pm(self, locale_str): """Test that using %p in the custom format work well""" # Skip if locale cannot be set @@ -301,7 +290,6 @@ def test_period_custom_pm(self, fast_strftime, locale_str): p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") formatted = p.format( date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", - fast_strftime=fast_strftime, ) assert ( formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456789)" From 4ef8ec1cbae893afa9bc5d43df6f08ac406e150e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 18 Nov 2023 15:34:47 +0100 Subject: [PATCH 074/115] Fixed docstring --- pandas/_libs/tslibs/strftime.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 12b145404d420..2981b113cc2b5 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -141,9 +141,8 @@ def convert_strftime_format( This method can be tested on a single instance of - - `datetime` or `Timestamp`, through - `pandas.core.tools.datetimes.fast_strftime`. The - result may be compared with `datetime.strftime` or `Timestamp.strftime` + - `Timestamp`, through `Timestamp.fast_strftime`. The result may be compared + with `Timestamp.strftime` - `Period` through `Period.fast_strftime`. The result may be compared with `Period.strftime`. @@ -151,8 +150,8 @@ def convert_strftime_format( On array-like objects, this method is used in several places: - Subclasses of `DatelikeOps` now rely on this method in their - `self.strftime(fmt, fast_strftime=True)` default implementation, which - delegates to `_format_native_types`. + `self.strftime(fmt)` default implementation, which delegates to + `_format_native_types`. - `DatetimeArray._format_native_types` relies on `tslib.format_array_from_datetime` which relies on this function @@ -160,8 +159,8 @@ def convert_strftime_format( - `TimedeltaArray._format_native_types` does not currently support custom formats. - In addition, `Datetime64Formatter` and `Datetime64TZFormatter` also - rely on this when their attribute `fast_strftime` is `True` (default). + In addition, `Datetime64Formatter` and `Datetime64TZFormatter` rely on this + too. Parameters ---------- From 3c50a7691d3a19b9aff0eac51993ad9e1130dc65 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 18 Nov 2023 15:36:11 +0100 Subject: [PATCH 075/115] Trying to have meson understand that there is a py file in the tslib --- pandas/_libs/tslibs/meson.build | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index 85410f771233f..5bf60ab0ab093 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -51,6 +51,7 @@ sources_to_install = [ 'offsets.pyi', 'parsing.pyi', 'period.pyi', + 'strftime.py', 'strptime.pyi', 'timedeltas.pyi', 'timestamps.pyi', From e41df335e57edaac78f8ec0777f3e665b688ceb7 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 18 Nov 2023 15:42:46 +0100 Subject: [PATCH 076/115] Fixed ruff error --- pandas/tests/arrays/test_datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index a4a84318b8360..8e5a80f4849b6 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -977,7 +977,7 @@ def test_strftime_hypo(self, datetimes, fmt, tz_aware): fmt = "".join(fmt) if tz_aware: datetimes = [_dt.replace(tzinfo=dt.timezone.utc) for _dt in datetimes] - idx = pd.DatetimeIndex(datetimes) + idx = DatetimeIndex(datetimes) result = idx.strftime(fmt) expected = pd.Index([i.strftime(fmt) for i in datetimes]) tm.assert_index_equal(result, expected) From 6e0e092420a179bb4092644417f6be0fee4b5ec9 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 18 Nov 2023 16:11:51 +0100 Subject: [PATCH 077/115] Fixed tests --- pandas/io/formats/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b3632eb7bee4e..93594073f978b 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1690,7 +1690,7 @@ def _format_strings(self) -> list[str]: """we by definition have a TZ""" values = self.values.astype(object) formatter = self.formatter or get_format_datetime64( - is_dates_only_=False, + is_dates_only=False, date_format=self.date_format, ) fmt_values = [formatter(x) for x in values] From 83d5539f1a75c22b4270d2dbf7ae0cbd1e59304c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 18 Nov 2023 16:37:40 +0100 Subject: [PATCH 078/115] Fixed test --- pandas/tests/indexes/period/test_formats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 796ce6a1279eb..cf53b4df16b96 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -287,7 +287,7 @@ def test_period_custom_pm(self, locale_str): am_local, pm_local = get_local_am_pm() # 9 digits - p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") + p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="ns") formatted = p.format( date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", ) From cd4d4f08eedb740bf1f3ca411852c0a88e70cdce Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 18 Nov 2023 16:39:20 +0100 Subject: [PATCH 079/115] Fixed test --- pandas/tests/io/formats/test_to_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 89b80083c6a5a..884633d9b8e96 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -314,7 +314,7 @@ def test_to_csv_datetime_format_index(self): def test_to_csv_period_format_index(self): """Test that formatting also works for period index""" # same for periods - df_month = DataFrame({"A": pd.period_range("20130101", periods=5, freq="m")}) + df_month = DataFrame({"A": pd.period_range("20130101", periods=5, freq="M")}) df_month = df_month.set_index("A") # default date_format From 4198ea67ca8b7adbd17b18351add8ad940fa7f6b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 18 Nov 2023 17:43:00 +0100 Subject: [PATCH 080/115] Fixed test --- pandas/tests/indexes/period/test_formats.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index cf53b4df16b96..0ae3ca333fce8 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -288,16 +288,17 @@ def test_period_custom_pm(self, locale_str): # 9 digits p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="ns") - formatted = p.format( - date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", - ) + msg = r"PeriodIndex\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = p.format( + date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + ) assert ( formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456789)" ) assert ( formatted[1] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456790)" ) - # fmt: on def test_period_tz(self): # Formatting periods created from a datetime with timezone. From 8e2ef29b8ecba48b0d46ad5873bbf3b2f741fbe0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 18 Nov 2023 21:12:59 +0100 Subject: [PATCH 081/115] Fixed test for musl linux --- pandas/tests/arrays/test_datetimelike.py | 28 +++++++++--------------- 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 8e5a80f4849b6..f80e2c9d5c2d1 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -15,6 +15,7 @@ Timestamp, ) from pandas._libs.tslibs.dtypes import freq_to_period_freqstr +from pandas.compat import is_platform_windows import pandas as pd from pandas import ( @@ -622,40 +623,31 @@ def _is_supported_directive(d: str) -> bool: "%A", "%w", "%d", - "%-d", - "%#d", + "%#d" if is_platform_windows() else "%-d", "%b", "%B", "%m", - "%-m", - "%#m", + "%#m" if is_platform_windows() else "%-m", "%y", "%Y", "%H", - "%-H", - "%#H", + "%#H" if is_platform_windows() else "%-H", "%I", - "%-I", - "%#I", + "%#I" if is_platform_windows() else "%-I", "%p", "%M", - "%-M", - "%#M", + "%#M" if is_platform_windows() else "%-M", "%S", - "%-S", - "%#S", + "%#S" if is_platform_windows() else "%-S", "%f", "%z", "%Z", "%j", - "%-j", - "%#j", + "%#j" if is_platform_windows() else "%-j", "%U", - "%-U", - "%#U", + "%#U" if is_platform_windows() else "%-U", "%W", - "%-W", - "%#W", + "%#W" if is_platform_windows() else "%-W", "%c", "%x", "%X", From d0c38457e6c82f548325f3733677d0f75fef2cb3 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 18 Nov 2023 21:53:10 +0100 Subject: [PATCH 082/115] Fixed mypy errors --- pandas/_libs/tslib.pyi | 1 + pandas/_libs/tslibs/period.pyi | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 5a340c1d88bc4..97b3dfc10486a 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -10,6 +10,7 @@ def format_array_from_datetime( format: str | None = ..., na_rep: str | float = ..., reso: int = ..., # NPY_DATETIMEUNIT + fast_strftime: bool = ..., ) -> npt.NDArray[np.object_]: ... def array_with_unit_to_datetime( values: npt.NDArray[np.object_], diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 846d238beadbd..c2ae99c8e236e 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -48,6 +48,7 @@ def period_array_strftime( dtype_code: int, na_rep, date_format: str | None, + fast_strftime: bool, ) -> npt.NDArray[np.object_]: ... # exposed for tests From a3fcd6c612e2b25b66071dc93ce67f50ac6536c5 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 18 Nov 2023 21:54:52 +0100 Subject: [PATCH 083/115] Fixed mypy error --- pandas/_libs/tslibs/timestamps.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index c78c174e27902..2eb682e123d24 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -109,6 +109,7 @@ class Timestamp(datetime): ) -> datetime: ... @classmethod def fromisoformat(cls, date_string: str) -> Self: ... + def fast_strftime(self, fmt_str: str, loc_s: object) -> str: ... def strftime(self, format: str) -> str: ... def __format__(self, fmt: str) -> str: ... def toordinal(self) -> int: ... From 0390344491cfd435bf563c93ce115a618594f261 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 19 Nov 2023 10:10:07 +0100 Subject: [PATCH 084/115] Fixed mypy error --- pandas/io/formats/csvs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 6e695c174ad4c..d196a263b280d 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -194,7 +194,8 @@ def data_index(self) -> Index: and self.date_format is not None ): # Format and replace missing entries with empty string - data_index = data_index.strftime(date_format=self.date_format).fillna("") + data_index = data_index.strftime(date_format=self.date_format) + data_index = data_index.fillna("") # type: ignore[no-untyped-call] elif isinstance(data_index, ABCMultiIndex): data_index = data_index.remove_unused_levels() return data_index From 42e87c640f066ebcfe30871453120e795fbb293c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 13 Jan 2024 15:40:56 +0100 Subject: [PATCH 085/115] Removed `convert_strftime_format` from top-level API and moved it to `pd.tseries.api`. --- doc/source/whatsnew/v2.2.0.rst | 4 ++-- pandas/__init__.py | 2 -- pandas/core/api.py | 2 -- pandas/tests/api/test_api.py | 1 - pandas/tests/scalar/period/test_period.py | 3 +-- pandas/tests/scalar/timestamp/test_formats.py | 6 ++---- pandas/tests/tslibs/test_strftime.py | 2 +- pandas/tseries/api.py | 3 ++- 8 files changed, 8 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index f743d76c746cf..5ef1cf70a826c 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -101,9 +101,9 @@ Enhancements Strftime performance improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- New :func:`convert_strftime_format` to convert a strftime formatting template into a python string formatting template. +- New :func:`pd.tseries.api.convert_strftime_format` to convert a strftime formatting template into a python string formatting template. - New :meth:`Period.fast_strftime` and :meth:`Timestamp.fast_strftime` leveraging templates created with :func:`convert_strftime_format` -- All formatting procedures support faster strftime operations leveraging :func:`convert_strftime_format` and python string formatting: +- All formatting procedures support faster strftime operations leveraging :func:`pd.tseries.api.convert_strftime_format` and python string formatting: - in :meth:`DatetimeLikeArrayMixin.strftime` and :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their ``date_format`` argument is currently ignored. - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` diff --git a/pandas/__init__.py b/pandas/__init__.py index 0d314fdd77b5f..7fab662ed2de4 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -93,7 +93,6 @@ Interval, interval_range, DateOffset, - convert_strftime_format, # conversion to_numeric, to_datetime, @@ -299,7 +298,6 @@ "arrays", "bdate_range", "concat", - "convert_strftime_format", "crosstab", "cut", "date_range", diff --git a/pandas/core/api.py b/pandas/core/api.py index c373557fe3af9..2cfe5ffc0170d 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -3,7 +3,6 @@ Period, Timedelta, Timestamp, - convert_strftime_format, ) from pandas._libs.missing import NA @@ -117,7 +116,6 @@ "NaT", "notna", "notnull", - "convert_strftime_format", "Period", "PeriodDtype", "PeriodIndex", diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index ed35466d576f9..60bcb97aaa364 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -107,7 +107,6 @@ class TestPDApi(Base): "array", "bdate_range", "concat", - "convert_strftime_format", "crosstab", "cut", "date_range", diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 67e46ade0c97e..e249ae9f14a36 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -10,7 +10,7 @@ import numpy as np import pytest -from pandas._libs.tslibs import iNaT +from pandas._libs.tslibs import convert_strftime_format, iNaT from pandas._libs.tslibs.ccalendar import ( DAYS, MONTHS, @@ -24,7 +24,6 @@ Period, Timedelta, Timestamp, - convert_strftime_format, offsets, ) import pandas._testing as tm diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index fedc245206cac..d6cbe7ece6b80 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -10,10 +10,8 @@ import pytest import pytz # a test below uses pytz but only inside a `eval` call -from pandas import ( - Timestamp, - convert_strftime_format, -) +from pandas import Timestamp +from pandas.tseries.api import convert_strftime_format import pandas._testing as tm ts_no_ns = Timestamp( diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py index 82a46a785836f..65cf97a8045c3 100644 --- a/pandas/tests/tslibs/test_strftime.py +++ b/pandas/tests/tslibs/test_strftime.py @@ -12,7 +12,7 @@ get_current_locale_specific_string, ) -from pandas import convert_strftime_format +from pandas.tseries.api import convert_strftime_format import pandas._testing as tm diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py index ec2d7d2304839..2362748293d4b 100644 --- a/pandas/tseries/api.py +++ b/pandas/tseries/api.py @@ -3,8 +3,9 @@ """ from pandas._libs.tslibs.parsing import guess_datetime_format +from pandas._libs.tslibs import convert_strftime_format from pandas.tseries import offsets from pandas.tseries.frequencies import infer_freq -__all__ = ["infer_freq", "offsets", "guess_datetime_format"] +__all__ = ["convert_strftime_format", "infer_freq", "offsets", "guess_datetime_format"] From 3dd707c4ffbc713c4e2d52ffb9920294810e9776 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sat, 13 Jan 2024 21:27:22 +0100 Subject: [PATCH 086/115] isort --- pandas/tests/scalar/period/test_period.py | 5 ++++- pandas/tests/scalar/timestamp/test_formats.py | 3 ++- pandas/tests/tslibs/test_strftime.py | 3 ++- pandas/tseries/api.py | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index e249ae9f14a36..52712637384dd 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -10,7 +10,10 @@ import numpy as np import pytest -from pandas._libs.tslibs import convert_strftime_format, iNaT +from pandas._libs.tslibs import ( + convert_strftime_format, + iNaT, +) from pandas._libs.tslibs.ccalendar import ( DAYS, MONTHS, diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 1fb1c5776a354..5a343be5b6410 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -11,9 +11,10 @@ import pytz # a test below uses pytz but only inside a `eval` call from pandas import Timestamp -from pandas.tseries.api import convert_strftime_format import pandas._testing as tm +from pandas.tseries.api import convert_strftime_format + ts_no_ns = Timestamp( year=2019, month=5, diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py index 65cf97a8045c3..8a06adbe3a2ad 100644 --- a/pandas/tests/tslibs/test_strftime.py +++ b/pandas/tests/tslibs/test_strftime.py @@ -12,9 +12,10 @@ get_current_locale_specific_string, ) -from pandas.tseries.api import convert_strftime_format import pandas._testing as tm +from pandas.tseries.api import convert_strftime_format + def get_local_am_pm(): """Return the AM and PM strings returned by strftime in current locale.""" diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py index 2362748293d4b..201f5f662f084 100644 --- a/pandas/tseries/api.py +++ b/pandas/tseries/api.py @@ -2,8 +2,8 @@ Timeseries API """ -from pandas._libs.tslibs.parsing import guess_datetime_format from pandas._libs.tslibs import convert_strftime_format +from pandas._libs.tslibs.parsing import guess_datetime_format from pandas.tseries import offsets from pandas.tseries.frequencies import infer_freq From a382325315d24d569576b55365fc2dce76575449 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 3 Mar 2024 15:04:33 +0100 Subject: [PATCH 087/115] As per code review: instance level `fast_strftime` are now private `_fast_strftime` --- pandas/_libs/tslib.pyx | 2 +- pandas/_libs/tslibs/period.pyx | 6 +++--- pandas/_libs/tslibs/strftime.py | 4 ++-- pandas/_libs/tslibs/timestamps.pyi | 2 +- pandas/_libs/tslibs/timestamps.pyx | 4 ++-- pandas/io/formats/format.py | 2 +- pandas/tests/scalar/period/test_period.py | 4 ++-- pandas/tests/scalar/timestamp/test_formats.py | 4 ++-- 8 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 9e3a87164491b..bee780a62f2e4 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -253,7 +253,7 @@ def format_array_from_datetime( ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz) # Use string formatting for faster strftime - res = ts.fast_strftime(str_format, loc_s) + res = ts._fast_strftime(str_format, loc_s) else: ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index d3db365203897..d04e447b1b8ef 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1419,7 +1419,7 @@ def period_array_strftime( # freq = frequency_corresponding_to_dtype_code(dtype_code) # per = Period(ordinal, freq=freq) # if fast_fmt: - # item_repr = per.fast_strftime(fast_fmt, fast_loc) + # item_repr = per._fast_strftime(fast_fmt, fast_loc) # elif date_format: # item_repr = per.strftime(date_format) # else: @@ -2582,7 +2582,7 @@ cdef class _Period(PeriodMixin): object_state = None, self.freq, self.ordinal return (Period, object_state) - def fast_strftime(self, fmt_str: str, loc_s: object) -> str: + def _fast_strftime(self, fmt_str: str, loc_s: object) -> str: """A faster alternative to `strftime` using string formatting. `fmt_str` and `loc_s` should be created using @@ -2598,7 +2598,7 @@ cdef class _Period(PeriodMixin): >>> a.strftime('%F-Q%q') '2006-Q1' >>> fast_fmt, loc_s = convert_strftime_format('%F-Q%q', target="period") - >>> a.fast_strftime(fast_fmt, loc_s) + >>> a._fast_strftime(fast_fmt, loc_s) '2006-Q1' """ value = self.ordinal diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 2981b113cc2b5..764d1f1229fb3 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -141,10 +141,10 @@ def convert_strftime_format( This method can be tested on a single instance of - - `Timestamp`, through `Timestamp.fast_strftime`. The result may be compared + - `Timestamp`, through `Timestamp._fast_strftime`. The result may be compared with `Timestamp.strftime` - - `Period` through `Period.fast_strftime`. The result may be compared + - `Period` through `Period._fast_strftime`. The result may be compared with `Period.strftime`. On array-like objects, this method is used in several places: diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 713addbc3d4f9..89b7a04acbc5e 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -107,7 +107,7 @@ class Timestamp(datetime): ) -> datetime: ... @classmethod def fromisoformat(cls, date_string: str) -> Self: ... - def fast_strftime(self, fmt_str: str, loc_s: object) -> str: ... + def _fast_strftime(self, fmt_str: str, loc_s: object) -> str: ... def strftime(self, format: str) -> str: ... def __format__(self, fmt: str) -> str: ... def toordinal(self) -> int: ... diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index c95cd753db701..f0a088cec7b36 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1473,7 +1473,7 @@ class Timestamp(_Timestamp): tz = maybe_get_tz(tz) return cls(datetime.fromtimestamp(ts, tz)) - def fast_strftime(self, fmt_str: str, loc_s: object) -> str: + def _fast_strftime(self, fmt_str: str, loc_s: object) -> str: """A faster alternative to `strftime` using string formatting. `fmt_str` and `loc_s` should be created using @@ -1486,7 +1486,7 @@ class Timestamp(_Timestamp): >>> from pandas._libs.tslibs import convert_strftime_format >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') >>> fmt, loc_s = convert_strftime_format('%Y-%m-%dT%H:%M:%S', target="datetime") - >>> ts.fast_strftime(fmt, loc_s) + >>> ts._fast_strftime(fmt, loc_s) '2020-03-14T15:32:52' """ y = self.year diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2e24c4a440808..a26e5c09f3e04 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1621,7 +1621,7 @@ def _format_datetime64_dateonly( if date_format: if str_date_fmt: # Faster, using string formatting - return x.fast_strftime(str_date_fmt, loc_s) + return x._fast_strftime(str_date_fmt, loc_s) else: # Slower return x.strftime(date_format) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 342c0cfe69838..1480a2d2e7fb1 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -830,10 +830,10 @@ def test_strftime_locale(self, locale_str): # Period am_per = Period("2018-03-11 01:00", freq="h") assert am_local == am_per.strftime("%p") - assert am_local == am_per.fast_strftime(str_tmp, loc_s) + assert am_local == am_per._fast_strftime(str_tmp, loc_s) pm_per = Period("2018-03-11 13:00", freq="h") assert pm_local == pm_per.strftime("%p") - assert pm_local == pm_per.fast_strftime(str_tmp, loc_s) + assert pm_local == pm_per._fast_strftime(str_tmp, loc_s) class TestPeriodProperties: diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 5a343be5b6410..ede0d8571b3a4 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -248,7 +248,7 @@ def test_strftime_locale(self, locale_str): # Timestamp am_ts = Timestamp(2020, 1, 1, 1) assert am_local == am_ts.strftime("%p") - assert am_local == am_ts.fast_strftime(str_tmp, loc_s) + assert am_local == am_ts._fast_strftime(str_tmp, loc_s) pm_ts = Timestamp(2020, 1, 1, 13) assert pm_local == pm_ts.strftime("%p") - assert pm_local == pm_ts.fast_strftime(str_tmp, loc_s) + assert pm_local == pm_ts._fast_strftime(str_tmp, loc_s) From d2ef85eea0b320b23ba540145ec8428de2427b10 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 3 Mar 2024 15:07:06 +0100 Subject: [PATCH 088/115] Added maintenance comment --- pandas/io/formats/csvs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index d196a263b280d..5c461da6ffa60 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -316,6 +316,7 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: slicer = slice(start_i, end_i) df = self.obj.iloc[slicer] + # Format the values. Note: `self._number_format` includes `date_format` if any res = df._get_values_for_csv(**self._number_format) data = list(res._iter_column_arrays()) From b0471a7778b2ae943ac110532fcbdf9ffba96c81 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 3 Mar 2024 15:07:49 +0100 Subject: [PATCH 089/115] black+isort --- pandas/_libs/tslibs/strftime.py | 6 ++---- pandas/tests/arrays/test_datetimelike.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 764d1f1229fb3..20dcc00a74584 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -95,9 +95,7 @@ def __init__(self, am: str, pm: str) -> None: self.pm = pm def __repr__(self) -> str: - attrs = ", ".join( - [f"{k}={repr(getattr(self, k))}" for k in type(self).__slots__] - ) + attrs = ", ".join([f"{k}={getattr(self, k)!r}" for k in type(self).__slots__]) return f"{type(self).__name__}({attrs})" @classmethod @@ -219,7 +217,7 @@ def convert_strftime_format( directive_maps = (_COMMON_MAP, _PERIOD_MAP) unsupported = (_COMMON_UNSUPPORTED, _PERIOD_UNSUPPORTED) else: - raise ValueError(f"Invalid target: {repr(target)}") + raise ValueError(f"Invalid target: {target!r}") # Raise if unsupported directive found in `strftime_fmt` for _u in unsupported: diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index a913abf7c2f61..c140994a6ca19 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -15,8 +15,8 @@ Timestamp, ) from pandas._libs.tslibs import to_offset - from pandas.compat import is_platform_windows + from pandas.core.dtypes.dtypes import PeriodDtype import pandas as pd From bbbf7216df37adf49fd4b2f08c5a490e31aaded2 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 3 Mar 2024 15:16:31 +0100 Subject: [PATCH 090/115] Moved changelog to 3.0.0 and improved it slightly --- doc/source/whatsnew/v2.2.0.rst | 11 ----------- doc/source/whatsnew/v3.0.0.rst | 13 +++++++++---- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 36bfbaf9e6960..329ef2859f56f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -97,17 +97,6 @@ This future dtype inference logic can be enabled with: Enhancements ~~~~~~~~~~~~ -.. _whatsnew_220.enhancements.strftime: - -Strftime performance improvements -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- New :func:`pd.tseries.api.convert_strftime_format` to convert a strftime formatting template into a python string formatting template. -- New :meth:`Period.fast_strftime` and :meth:`Timestamp.fast_strftime` leveraging templates created with :func:`convert_strftime_format` -- All formatting procedures support faster strftime operations leveraging :func:`pd.tseries.api.convert_strftime_format` and python string formatting: - - in :meth:`DatetimeLikeArrayMixin.strftime` and :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their ``date_format`` argument is currently ignored. - - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` - - .. _whatsnew_220.enhancements.adbc_support: ADBC Driver support in to_sql and read_sql diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 0f125af599b12..c2fa565e62c1f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -19,10 +19,15 @@ Enhancements enhancement1 ^^^^^^^^^^^^ -.. _whatsnew_300.enhancements.enhancement2: - -enhancement2 -^^^^^^^^^^^^ +.. _whatsnew_300.enhancements.strftime: + +Strftime performance improvements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- New :func:`pd.tseries.api.convert_strftime_format` to convert a strftime formatting template into a python string formatting template. +- All formatting procedures support faster strftime operations thanks to :func:`pd.tseries.api.convert_strftime_format`: + - in :meth:`DatetimeLikeArrayMixin.strftime` and :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their ``date_format`` argument is currently ignored. + - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` +- New instance-level :meth:`Period._fast_strftime` and :meth:`Timestamp._fast_strftime` leveraging templates created with :func:`convert_strftime_format` .. _whatsnew_300.enhancements.other: From 691b12746596bff841aaf37c9001bfea8546df7c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 3 Mar 2024 20:55:24 +0100 Subject: [PATCH 091/115] Fixed test: fast_strftime not part of api anymore (private) --- pandas/tests/scalar/test_nat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 754d1fc012a0c..0dc8084311fff 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -185,7 +185,6 @@ def test_nat_iso_format(get_nat): ( Timestamp, [ - "fast_strftime", "normalize", "to_julian_date", "to_period", From e0ddfcbdfd8e7257bdcc033098cbe5b16912cc77 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 8 Mar 2024 22:16:41 +0100 Subject: [PATCH 092/115] formatting --- pandas/_libs/tslibs/strftime.py | 4 ++-- pandas/tests/tslibs/test_strftime.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 20dcc00a74584..f690833e76dcc 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -1,5 +1,5 @@ -"""Strftime-related classes and functions. -""" +"""Strftime-related classes and functions.""" + from __future__ import annotations from datetime import time diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py index 8a06adbe3a2ad..0d864186abcc5 100644 --- a/pandas/tests/tslibs/test_strftime.py +++ b/pandas/tests/tslibs/test_strftime.py @@ -1,6 +1,7 @@ """ Test datetime formatting low-level routines """ + from contextlib import nullcontext from datetime import time import locale From b09c641b575629e3a0f11d9e199e5fb122daab9e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 8 Mar 2024 22:35:52 +0100 Subject: [PATCH 093/115] Fixed ASV benchmark: format method disappeared from DateTimeIndex --- asv_bench/benchmarks/strftime.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index a02362c9dda7d..783cabb1af6d6 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -32,14 +32,17 @@ def time_frame_date_to_str(self, nobs, tz_aware): def time_frame_date_formatting_default(self, nobs, tz_aware): self.data["d"].dt.strftime(date_format=None) + def time_frame_date_formatting_index_to_str(self, nobs, tz_aware): + self.data.index.astype(str) + def time_frame_date_formatting_index_default(self, nobs, tz_aware): - self.data.index.format() + self.data.index.strftime(date_format=None) def time_frame_date_formatting_custom(self, nobs, tz_aware): self.data["d"].dt.strftime(date_format="%Y---%m---%d") def time_frame_date_formatting_index_custom(self, nobs, tz_aware): - self.data.index.format(date_format="%Y---%m---%d") + self.data.index.strftime(date_format="%Y---%m---%d") def time_frame_datetime_to_str(self, nobs, tz_aware): self.data["dt"].astype(str) @@ -56,14 +59,17 @@ def time_frame_datetime_formatting_default_explicit(self, nobs, tz_aware): def time_frame_datetime_formatting_default_with_float(self, nobs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") + def time_frame_datetime_formatting_index_to_str(self, nobs, tz_aware): + self.data.set_index("dt").index.astype(str) + def time_frame_datetime_formatting_index_default(self, nobs, tz_aware): - self.data.set_index("dt").index.format() + self.data.set_index("dt").index.strftime() def time_frame_datetime_formatting_custom(self, nobs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") def time_frame_datetime_formatting_index_custom(self, nobs, tz_aware): - self.data.set_index("dt").index.format(date_format="%Y-%m-%d --- %H:%M:%S") + self.data.set_index("dt").index.strftime(date_format="%Y-%m-%d --- %H:%M:%S") def time_frame_datetime_formatting_iso8601_map(self, nobs, tz_aware): self.data["dt"].map(lambda timestamp: timestamp.isoformat()) From a519427db8e5342df6bbed00db13f1eb7805dfc1 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 8 Mar 2024 22:48:33 +0100 Subject: [PATCH 094/115] Improved what's new --- doc/source/whatsnew/v3.0.0.rst | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4ea4834c37e74..cde0afe734b0e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -23,12 +23,16 @@ enhancement1 Strftime performance improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +All datetime formatting procedures support faster strftime operations, achieved with using python string formatting instead of OS ``strftime``:: +- in :meth:`DatetimeLikeArrayMixin.strftime`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their ``date_format`` argument is currently ignored. +- in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` + +This is achieved thanks to:: - New :func:`pd.tseries.api.convert_strftime_format` to convert a strftime formatting template into a python string formatting template. -- All formatting procedures support faster strftime operations thanks to :func:`pd.tseries.api.convert_strftime_format`: - - in :meth:`DatetimeLikeArrayMixin.strftime` and :meth:`DatetimeIndexOpsMixin.format`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their ``date_format`` argument is currently ignored. - - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` - New instance-level :meth:`Period._fast_strftime` and :meth:`Timestamp._fast_strftime` leveraging templates created with :func:`convert_strftime_format` +strftime templates that can not be converted to such a fast python string template continue to be processed with OS ``strftime`` as fallback. + .. _whatsnew_300.enhancements.other: Other enhancements From 098e53e8bfecf90f7858529dcfacf53cc437ff9b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 8 Mar 2024 23:23:39 +0100 Subject: [PATCH 095/115] Fixed ASV error --- asv_bench/benchmarks/strftime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index 783cabb1af6d6..e169e6d940850 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -63,7 +63,7 @@ def time_frame_datetime_formatting_index_to_str(self, nobs, tz_aware): self.data.set_index("dt").index.astype(str) def time_frame_datetime_formatting_index_default(self, nobs, tz_aware): - self.data.set_index("dt").index.strftime() + self.data.set_index("dt").index.strftime(date_format=None) def time_frame_datetime_formatting_custom(self, nobs, tz_aware): self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") From e71959954c5e89986e6dac0cec94d69cdadefca8 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 2 Apr 2024 23:27:35 +0200 Subject: [PATCH 096/115] Code review: renamed `loc_s` into `locale_dt_strings` --- pandas/_libs/tslib.pyx | 10 ++++++---- pandas/_libs/tslibs/period.pyx | 12 ++++++------ pandas/_libs/tslibs/strftime.py | 6 +++--- pandas/_libs/tslibs/timestamps.pyi | 2 +- pandas/_libs/tslibs/timestamps.pyx | 6 +++--- pandas/io/formats/format.py | 12 ++++++------ 6 files changed, 25 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index bee780a62f2e4..ade3cee84f1cc 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -154,7 +154,7 @@ def format_array_from_datetime( _Timestamp ts object res npy_datetimestruct dts - object str_format, loc_s + object str_format, locale_dt_strings # Note that `result` (and thus `result_flat`) is C-order and # `it` iterates C-order as well, so the iteration matches @@ -199,7 +199,9 @@ def format_array_from_datetime( else: try: # Try to get the string formatting template for this format - str_format, loc_s = convert_strftime_format(format, target="datetime") + str_format, locale_dt_strings = convert_strftime_format( + format, target="datetime" + ) except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` fast_strftime = False @@ -244,7 +246,7 @@ def format_array_from_datetime( "day": dts.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": loc_s.pm if (h // 12) else loc_s.am, + "ampm": locale_dt_strings.pm if (h // 12) else locale_dt_strings.am, "min": dts.min, "sec": dts.sec, "us": dts.us, @@ -253,7 +255,7 @@ def format_array_from_datetime( ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz) # Use string formatting for faster strftime - res = ts._fast_strftime(str_format, loc_s) + res = ts._fast_strftime(str_format, locale_dt_strings) else: ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index d04e447b1b8ef..7794f16401073 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2582,10 +2582,10 @@ cdef class _Period(PeriodMixin): object_state = None, self.freq, self.ordinal return (Period, object_state) - def _fast_strftime(self, fmt_str: str, loc_s: object) -> str: + def _fast_strftime(self, fmt_str: str, locale_dt_strings: object) -> str: """A faster alternative to `strftime` using string formatting. - `fmt_str` and `loc_s` should be created using + `fmt_str` and `locale_dt_strings` should be created using `convert_strftime_format(fmt, target="period")`. See also `self.strftime`, that relies on `period_format`. @@ -2597,8 +2597,8 @@ cdef class _Period(PeriodMixin): >>> a = Period(freq='Q-JUL', year=2006, quarter=1) >>> a.strftime('%F-Q%q') '2006-Q1' - >>> fast_fmt, loc_s = convert_strftime_format('%F-Q%q', target="period") - >>> a._fast_strftime(fast_fmt, loc_s) + >>> fast_fmt, loc_dt_strs = convert_strftime_format('%F-Q%q', target="period") + >>> a._fast_strftime(fast_fmt, loc_dt_strs) '2006-Q1' """ value = self.ordinal @@ -2613,8 +2613,8 @@ cdef class _Period(PeriodMixin): "dummy" if fmt_str is not None else None, None, fmt_str, - loc_s.am, - loc_s.pm + locale_dt_strings.am, + locale_dt_strings.pm ) def strftime(self, fmt: str | None) -> str: diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index f690833e76dcc..81fcaab5b39eb 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -183,10 +183,10 @@ def convert_strftime_format( `new_style_formatting`. For old-style, it may be used as `fmt_out % fmt_dct`. For new-style, it may be used as `fmt_out.format(**fmt_dct)` - loc_s : LocaleSpecificDtStrings + locale_dt_strings : LocaleSpecificDtStrings An object containing the locale-specific strings needed for some of the - directives. For example loc_s.am and loc_s.pm should be used to fill the "ampm" - part of the template, induced by directive %p. + directives. For example locale_dt_strings.am and locale_dt_strings.pm should be + used to fill the "ampm" part of the template, induced by directive %p. Raises ------ diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 89b7a04acbc5e..b995749fbd9dd 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -107,7 +107,7 @@ class Timestamp(datetime): ) -> datetime: ... @classmethod def fromisoformat(cls, date_string: str) -> Self: ... - def _fast_strftime(self, fmt_str: str, loc_s: object) -> str: ... + def _fast_strftime(self, fmt_str: str, locale_dt_strings: object) -> str: ... def strftime(self, format: str) -> str: ... def __format__(self, fmt: str) -> str: ... def toordinal(self) -> int: ... diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index f0a088cec7b36..e048136026e1d 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1473,10 +1473,10 @@ class Timestamp(_Timestamp): tz = maybe_get_tz(tz) return cls(datetime.fromtimestamp(ts, tz)) - def _fast_strftime(self, fmt_str: str, loc_s: object) -> str: + def _fast_strftime(self, fmt_str: str, locale_dt_strings: object) -> str: """A faster alternative to `strftime` using string formatting. - `fmt_str` and `loc_s` should be created using + `fmt_str` and `locale_dt_strings` should be created using `convert_strftime_format(fmt, target="datetime")`. See also `self.strftime`, that relies on `datetime.strftime`. @@ -1498,7 +1498,7 @@ class Timestamp(_Timestamp): "day": self.day, "hour": h, "hour12": 12 if h in (0, 12) else (h % 12), - "ampm": loc_s.pm if (h // 12) else loc_s.am, + "ampm": locale_dt_strings.pm if (h // 12) else locale_dt_strings.am, "min": self.minute, "sec": self.second, "us": self.microsecond, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index ba4ba72c50822..12684491a903b 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1613,16 +1613,16 @@ def _format_datetime64_dateonly( nat_rep: str = "NaT", date_format: str | None = None, str_date_fmt: str | None = None, - loc_s: object | None = None, + locale_dt_strings: object | None = None, ) -> str: - """str_date_fmt, loc_s are for fast strftime""" + """str_date_fmt, locale_dt_strings are for fast strftime""" if isinstance(x, NaTType): return nat_rep if date_format: if str_date_fmt: # Faster, using string formatting - return x._fast_strftime(str_date_fmt, loc_s) + return x._fast_strftime(str_date_fmt, locale_dt_strings) else: # Slower return x.strftime(date_format) @@ -1638,11 +1638,11 @@ def get_format_datetime64( a string as output""" if is_dates_only: - str_date_fmt = loc_s = None + str_date_fmt = locale_dt_strings = None if date_format is not None: try: # Try to get the string formatting template for this format - str_date_fmt, loc_s = convert_strftime_format( + str_date_fmt, locale_dt_strings = convert_strftime_format( date_format, target="datetime" ) except UnsupportedStrFmtDirective: @@ -1653,7 +1653,7 @@ def get_format_datetime64( nat_rep=nat_rep, date_format=date_format, str_date_fmt=str_date_fmt, - loc_s=loc_s, + locale_dt_strings=locale_dt_strings, ) else: # Relies on datetime.str, which is fast already From ffc661d550eabbd4db50f265773d2e8c7f00db43 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2024 00:03:09 +0200 Subject: [PATCH 097/115] Code review: moved whats new entry to perf --- doc/source/whatsnew/v3.0.0.rst | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 634ae060bacd9..14fc5d9c033e5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -19,19 +19,10 @@ Enhancements enhancement1 ^^^^^^^^^^^^ -.. _whatsnew_300.enhancements.strftime: +.. _whatsnew_300.enhancements.enhancement2: -Strftime performance improvements -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -All datetime formatting procedures support faster strftime operations, achieved with using python string formatting instead of OS ``strftime``:: -- in :meth:`DatetimeLikeArrayMixin.strftime`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their ``date_format`` argument is currently ignored. -- in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` - -This is achieved thanks to:: -- New :func:`pd.tseries.api.convert_strftime_format` to convert a strftime formatting template into a python string formatting template. -- New instance-level :meth:`Period._fast_strftime` and :meth:`Timestamp._fast_strftime` leveraging templates created with :func:`convert_strftime_format` - -strftime templates that can not be converted to such a fast python string template continue to be processed with OS ``strftime`` as fallback. +enhancement2 +^^^^^^^^^^^^ .. _whatsnew_300.enhancements.other: @@ -264,6 +255,13 @@ Removal of prior version deprecations/changes Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ + +- Performance improvement in all datetime formatting procedures, achieved with using python string formatting instead of OS ``strftime`` (:issue:`51298`):: + + - in :meth:`DatetimeLikeArrayMixin.strftime`. Classes :class:`DatetimeArray`, :class:`PeriodArray`, :class:`DatetimeIndex`, :class:`PeriodIndex` benefit from the improvement. :class:`TimedeltaArray.strftime` and :class:`TimedeltaArray.format` are not impacted as their ``date_format`` argument is currently ignored. + - in :meth:`NDFrame.to_csv`, :meth:`DataFrameRenderer.to_csv` and :class:`CSVFormatter` + - This is achieved thanks to new :func:`pd.tseries.api.convert_strftime_format` to convert a strftime formatting template into a python string formatting template. strftime templates that can not be converted to such a fast python string template continue to be processed with OS ``strftime`` as fallback. + - :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) - Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`) From b6442d34d90ec7d9288d8b74420b351d0e130687 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2024 00:07:22 +0200 Subject: [PATCH 098/115] Code review: removed commented out code --- asv_bench/benchmarks/io/json.py | 4 ---- asv_bench/benchmarks/strftime.py | 10 ---------- 2 files changed, 14 deletions(-) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 17b6c36d6ba76..bcbfcdea42dd9 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -205,10 +205,6 @@ def setup(self, orient): def time_iso_format(self, orient): self.df.to_json(orient=orient, date_format="iso") - # Providing a custom `date_format` is not possible today, this test is pointless - # def time_custom_format(self, orient): - # self.df.to_json(orient=orient, date_format="%Y-%m-%d__%H:%M:%S") - class ToJSONLines(BaseIO): fname = "__test__.json" diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index e169e6d940850..19d9d0a587ee1 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -81,11 +81,6 @@ def time_frame_datetime_formatting_iso8601_strftime_offset(self, nobs, tz_aware) """Not optimized yet as %z is not supported by `convert_strftime_format`""" self.data["dt"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%S%z") - # def time_frame_datetime_formatting_iso8601_isoformat(self, nobs, tz_aware): - # TODO this PR is probably a good opportunity to add this too, or maybe - # another PR - # self.data["dt"].dt.isoformat() - class PeriodStrftime: timeout = 1500 @@ -124,11 +119,6 @@ def time_frame_period_formatting_default_explicit(self, nobs, freq): def time_frame_period_formatting_custom(self, nobs, freq): self.data["p"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") - # def time_frame_period_formatting_iso8601_map(self, nobs, fq): - # TODO this PR is probably a good opportunity to add this too, or maybe - # another PR - # self.data["p"].map(lambda p: p.isoformat()) - def time_frame_period_formatting_iso8601_strftime_Z(self, nobs, freq): self.data["p"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%SZ") From 40a5c4873917d2cb105b80eb74d3053d89417f4c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2024 00:51:33 +0200 Subject: [PATCH 099/115] Code review: added support for negative and small years and added corresponding tests. --- pandas/_libs/tslib.pyx | 6 +++- pandas/_libs/tslibs/strftime.py | 2 +- pandas/_libs/tslibs/timestamps.pyx | 6 +++- pandas/tests/arrays/test_datetimelike.py | 31 +++++++++++++++++++ pandas/tests/scalar/timestamp/test_formats.py | 30 ++++++++++++++++++ 5 files changed, 72 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index ade3cee84f1cc..9c198ba1555df 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -238,10 +238,14 @@ def format_array_from_datetime( # Use string formatting for faster strftime y = dts.year + shortyear = y % 100 + if y < 0 and shortyear != 0: + # Fix negative modulo to adopt C-style modulo + shortyear -= 100 h = dts.hour res = str_format % { "year": y, - "shortyear": y % 100, + "shortyear": shortyear, "month": dts.month, "day": dts.day, "hour": h, diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 81fcaab5b39eb..99edf81505cc8 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -37,7 +37,7 @@ class UnsupportedStrFmtDirective(ValueError): _COMMON_MAP = { "%d": ("day", "02d"), # Day of the month as a zero-padded decimal number. "%m": ("month", "02d"), # Month as a zero-padded decimal number. - "%Y": ("year", "d"), # Year with century as a decimal number. + "%Y": ("year", "04d"), # Year with century as a 0-padded decimal number. "%y": ("shortyear", "02d"), # Year without century as 0-padded decimal nb. "%H": ("hour", "02d"), # Hour (24-hour clock) as 0-padded decimal number. "%I": ("hour12", "02d"), # Hour (12-hour clock) as a 0-padded decimal nb. diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index e048136026e1d..a7f8d4eb738e1 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1490,10 +1490,14 @@ class Timestamp(_Timestamp): '2020-03-14T15:32:52' """ y = self.year + shortyear = y % 100 + if y < 0 and shortyear != 0: + # Fix negative modulo to adopt C-style modulo + shortyear -= 100 h = self.hour return fmt_str % { "year": y, - "shortyear": y % 100, + "shortyear": shortyear, "month": self.month, "day": self.day, "hour": h, diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index b4417286b26cc..6d0bbddbb04d9 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -964,6 +964,37 @@ def test_strftime_nat(self): expected = np.array(["2019-01-01", np.nan], dtype=object) tm.assert_numpy_array_equal(result, expected) + def test_strftime_small_years(self): + np_dts = np.array( + ["220-01-01", "10-01-01", "1-01-01", "0-01-01"], "datetime64[s]" + ) + arr = DatetimeIndex(np_dts)._data + + result = arr.strftime("%Y-%m-%d__%y") + # Note that either -20 or 20 could be considered correct for the first %y. + # We opt for 20 to preserve the "two digits" property of %y. + expected = np.array( + ["0220-01-01__20", "0010-01-01__10", "0001-01-01__01", "0000-01-01__00"], + dtype=object, + ) + tm.assert_numpy_array_equal(result, expected) + + def test_strftime_negative(self): + np_neg = np.array( + ["-2020-01-01", "-220-01-01", "-200-01-01", "-1-01-01"], "datetime64[s]" + ) + arr = DatetimeIndex(np_neg)._data + + result = arr.strftime("%Y-%m-%d__%y") + # Note that either -20 or 20 could be considered correct for the first %y. + # We opt for -20 to have the same property than %Y. + # Similarly note that we have -1 instead of -01 + expected = np.array( + ["-2020-01-01__-20", "-220-01-01__-20", "-200-01-01__00", "-001-01-01__-1"], + dtype=object, + ) + tm.assert_numpy_array_equal(result, expected) + @given( datetimes=st.lists( st.datetimes( diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 8730cf12a7cd4..87f4bd684802f 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -9,6 +9,7 @@ import dateutil.tz import pytest import pytz # a test below uses pytz but only inside a `eval` call +from pytz.exceptions import NonExistentTimeError from pandas import Timestamp import pandas._testing as tm @@ -215,6 +216,35 @@ def test_repr_matches_pydatetime_tz_dateutil(self): dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=utc) assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) + def test_timestamp_repr_strftime_small_year_date(self): + stamp = Timestamp("0020-01-01") + assert repr(stamp) == "Timestamp('20-01-01 00:00:00')" + + str_tmp, loc_dt_s = convert_strftime_format("%Y-%y", target="datetime") + assert stamp._fast_strftime(str_tmp, loc_dt_s) == "0020-20" + + def test_timestamp_repr_strftime_negative_date(self): + stamp = Timestamp("-0020-01-01") + assert repr(stamp) == "Timestamp('-20-01-01 00:00:00')" + + with pytest.raises( + NotImplementedError, + match="strftime not yet supported on Timestamps which are outside the " + "range of Python's standard library.", + ): + stamp.strftime("%y") + + str_tmp, loc_dt_s = convert_strftime_format("%y", target="datetime") + assert Timestamp("-0020-01-01")._fast_strftime(str_tmp, loc_dt_s) == "-20" + assert Timestamp("-0002-01-01")._fast_strftime(str_tmp, loc_dt_s) == "-2" + + str_tmp, loc_dt_s = convert_strftime_format("%Y", target="datetime") + assert Timestamp("-2020-01-01")._fast_strftime(str_tmp, loc_dt_s) == "-2020" + assert Timestamp("-0200-01-01")._fast_strftime(str_tmp, loc_dt_s) == "-200" + + with pytest.raises(NonExistentTimeError): + Timestamp("-0020-01-01", tz="US/Eastern") + @pytest.mark.parametrize( "locale_str", [ From b4983f85751f6954017eb7cde390c3e6fe92dd19 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2024 01:09:34 +0200 Subject: [PATCH 100/115] Improved test for small years --- pandas/tests/scalar/timestamp/test_formats.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 87f4bd684802f..0363787f1ad36 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -217,11 +217,14 @@ def test_repr_matches_pydatetime_tz_dateutil(self): assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) def test_timestamp_repr_strftime_small_year_date(self): - stamp = Timestamp("0020-01-01") - assert repr(stamp) == "Timestamp('20-01-01 00:00:00')" + stamp = Timestamp("0002-01-01") + assert repr(stamp) == "Timestamp('2-01-01 00:00:00')" - str_tmp, loc_dt_s = convert_strftime_format("%Y-%y", target="datetime") - assert stamp._fast_strftime(str_tmp, loc_dt_s) == "0020-20" + # Make sure we agree with datetime.strftime + assert datetime(2, 1, 1).strftime("%Y") == "0002" + assert stamp.strftime("%Y") == "0002" + str_tmp, loc_dt_s = convert_strftime_format("%Y", target="datetime") + assert stamp._fast_strftime(str_tmp, loc_dt_s) == "0002" def test_timestamp_repr_strftime_negative_date(self): stamp = Timestamp("-0020-01-01") From 159f1bdacdbb8d96a18ab5f84264aff3f6a97650 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2024 09:48:35 +0200 Subject: [PATCH 101/115] Fixed failing test --- pandas/tests/tslibs/test_strftime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py index 0d864186abcc5..a88c8fd6c4814 100644 --- a/pandas/tests/tslibs/test_strftime.py +++ b/pandas/tests/tslibs/test_strftime.py @@ -62,8 +62,8 @@ class TestConvertStrftimeFormat: ("%p", "%(ampm)s", "{ampm:s}"), ( "%m-%d-%Y", - "%(month)02d-%(day)02d-%(year)d", - "{month:02d}-{day:02d}-{year:d}", + "%(month)02d-%(day)02d-%(year)04d", + "{month:02d}-{day:02d}-{year:04d}", ), ( "20%y-%m-%d__foo__%I:%M:%S%p", From 76fb52e5e2f6d917a5eeadd74de9f61e948bb924 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2024 14:00:11 +0200 Subject: [PATCH 102/115] Fixed failing test --- pandas/tests/tslibs/test_strftime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py index a88c8fd6c4814..1d417a288fa91 100644 --- a/pandas/tests/tslibs/test_strftime.py +++ b/pandas/tests/tslibs/test_strftime.py @@ -93,8 +93,8 @@ def test_format_datetime(self, strftime_fmt, res_fmt_old, res_fmt_new): ("%p", "%(ampm)s", "{ampm:s}"), ( "%m-%d-%Y", - "%(month)02d-%(day)02d-%(year)d", - "{month:02d}-{day:02d}-{year:d}", + "%(month)02d-%(day)02d-%(year)04d", + "{month:02d}-{day:02d}-{year:04d}", ), ( "%y %I:%M:%S%p (ms=%l us=%u ns=%n)", From 103d9cfeb0b83d88baad56158f73b5ff2c33ecc0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2024 14:41:04 +0200 Subject: [PATCH 103/115] Fixed test on linux --- pandas/tests/scalar/timestamp/test_formats.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 0363787f1ad36..8abdc9bec6ec2 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -220,11 +220,10 @@ def test_timestamp_repr_strftime_small_year_date(self): stamp = Timestamp("0002-01-01") assert repr(stamp) == "Timestamp('2-01-01 00:00:00')" - # Make sure we agree with datetime.strftime - assert datetime(2, 1, 1).strftime("%Y") == "0002" - assert stamp.strftime("%Y") == "0002" - str_tmp, loc_dt_s = convert_strftime_format("%Y", target="datetime") - assert stamp._fast_strftime(str_tmp, loc_dt_s) == "0002" + # Make sure we have zero-padding consistent with python strftime + assert stamp.strftime("%Y-%y") == "0002-02" + str_tmp, loc_dt_s = convert_strftime_format("%Y-%y", target="datetime") + assert stamp._fast_strftime(str_tmp, loc_dt_s) == "0002-02" def test_timestamp_repr_strftime_negative_date(self): stamp = Timestamp("-0020-01-01") From 478ea4ef08cb829f2d19aade2678ae83a3810105 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2024 16:22:07 +0200 Subject: [PATCH 104/115] Trying to debug datetime.strftime on linux --- pandas/tests/scalar/timestamp/test_formats.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 8abdc9bec6ec2..3a8569ad3737a 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -220,10 +220,10 @@ def test_timestamp_repr_strftime_small_year_date(self): stamp = Timestamp("0002-01-01") assert repr(stamp) == "Timestamp('2-01-01 00:00:00')" - # Make sure we have zero-padding consistent with python strftime - assert stamp.strftime("%Y-%y") == "0002-02" - str_tmp, loc_dt_s = convert_strftime_format("%Y-%y", target="datetime") - assert stamp._fast_strftime(str_tmp, loc_dt_s) == "0002-02" + # Make sure we have zero-padding, consistent with python strftime + assert stamp.strftime("%Y") == "0002", f"actual: {stamp.strftime('%Y')}" + str_tmp, loc_dt_s = convert_strftime_format("%Y", target="datetime") + assert stamp._fast_strftime(str_tmp, loc_dt_s) == "0002" def test_timestamp_repr_strftime_negative_date(self): stamp = Timestamp("-0020-01-01") From 51431b3d9478780dc5c174efea0659bb56b0e8c3 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2024 22:31:08 +0200 Subject: [PATCH 105/115] Fixed tests on linux and windows --- pandas/tests/scalar/timestamp/test_formats.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 3a8569ad3737a..a31a0406fce7d 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -11,6 +11,8 @@ import pytz # a test below uses pytz but only inside a `eval` call from pytz.exceptions import NonExistentTimeError +from pandas.compat import is_platform_linux + from pandas import Timestamp import pandas._testing as tm @@ -216,15 +218,30 @@ def test_repr_matches_pydatetime_tz_dateutil(self): dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=utc) assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) + @pytest.mark.xfail( + is_platform_linux(), reason="strftime on linux does not zero-pad %Y" + ) def test_timestamp_repr_strftime_small_year_date(self): stamp = Timestamp("0002-01-01") assert repr(stamp) == "Timestamp('2-01-01 00:00:00')" - # Make sure we have zero-padding, consistent with python strftime + # Make sure we have zero-padding, consistent with python strftime doc assert stamp.strftime("%Y") == "0002", f"actual: {stamp.strftime('%Y')}" str_tmp, loc_dt_s = convert_strftime_format("%Y", target="datetime") assert stamp._fast_strftime(str_tmp, loc_dt_s) == "0002" + # @pytest.mark.xfail( + # is_platform_linux(), reason="strftime on linux does not zero-pad %y" + # ) + def test_timestamp_repr_strftime_small_shortyear_date(self): + stamp = Timestamp("1902-01-01") + assert repr(stamp) == "Timestamp('1902-01-01 00:00:00')" + + # Make sure we have zero-padding, consistent with python strftime doc + assert stamp.strftime("%y") == "02", f"actual: {stamp.strftime('%y')}" + str_tmp, loc_dt_s = convert_strftime_format("%y", target="datetime") + assert stamp._fast_strftime(str_tmp, loc_dt_s) == "02" + def test_timestamp_repr_strftime_negative_date(self): stamp = Timestamp("-0020-01-01") assert repr(stamp) == "Timestamp('-20-01-01 00:00:00')" From 847a9f3a8b283e5a92a44d6f31db1819b42e574d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 3 Apr 2024 22:56:57 +0200 Subject: [PATCH 106/115] Removed useless comment --- pandas/tests/scalar/timestamp/test_formats.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index a31a0406fce7d..d1f258b1a0939 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -230,9 +230,6 @@ def test_timestamp_repr_strftime_small_year_date(self): str_tmp, loc_dt_s = convert_strftime_format("%Y", target="datetime") assert stamp._fast_strftime(str_tmp, loc_dt_s) == "0002" - # @pytest.mark.xfail( - # is_platform_linux(), reason="strftime on linux does not zero-pad %y" - # ) def test_timestamp_repr_strftime_small_shortyear_date(self): stamp = Timestamp("1902-01-01") assert repr(stamp) == "Timestamp('1902-01-01 00:00:00')" From a21837cb4f57a86c132cf6aca4363f84d623e6f6 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 4 Apr 2024 09:10:30 +0200 Subject: [PATCH 107/115] Fixed tests on linux musl --- pandas/tests/scalar/timestamp/test_formats.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index d1f258b1a0939..1e6111e68207c 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -218,15 +218,14 @@ def test_repr_matches_pydatetime_tz_dateutil(self): dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=utc) assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) - @pytest.mark.xfail( - is_platform_linux(), reason="strftime on linux does not zero-pad %Y" - ) def test_timestamp_repr_strftime_small_year_date(self): stamp = Timestamp("0002-01-01") assert repr(stamp) == "Timestamp('2-01-01 00:00:00')" # Make sure we have zero-padding, consistent with python strftime doc - assert stamp.strftime("%Y") == "0002", f"actual: {stamp.strftime('%Y')}" + # Note: current behaviour of strftime with %Y is OS-dependent + assert stamp.strftime("%Y") == "2" if is_platform_linux() else "0002" + # This is not OS-dependent str_tmp, loc_dt_s = convert_strftime_format("%Y", target="datetime") assert stamp._fast_strftime(str_tmp, loc_dt_s) == "0002" From fc03926166d7edccf7f8a813ca4b3b3f10800696 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 4 Apr 2024 12:43:29 +0200 Subject: [PATCH 108/115] Attempt to fix on MUSL --- pandas/tests/scalar/timestamp/test_formats.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 1e6111e68207c..74c4f41052f90 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -224,7 +224,11 @@ def test_timestamp_repr_strftime_small_year_date(self): # Make sure we have zero-padding, consistent with python strftime doc # Note: current behaviour of strftime with %Y is OS-dependent - assert stamp.strftime("%Y") == "2" if is_platform_linux() else "0002" + if is_platform_linux(): + assert stamp.strftime("%Y") == "2", f"Actual: {stamp.strftime('%Y')}" + else: + assert stamp.strftime("%Y") == "0002", f"Actual: {stamp.strftime('%Y')}" + # This is not OS-dependent str_tmp, loc_dt_s = convert_strftime_format("%Y", target="datetime") assert stamp._fast_strftime(str_tmp, loc_dt_s) == "0002" From 03dfc52515adfcfd382ce677245e7f8da9e7aba2 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 4 Apr 2024 13:06:18 +0200 Subject: [PATCH 109/115] Fixed test for linux musl --- pandas/tests/scalar/timestamp/test_formats.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 74c4f41052f90..77b0428888de5 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -11,7 +11,10 @@ import pytz # a test below uses pytz but only inside a `eval` call from pytz.exceptions import NonExistentTimeError -from pandas.compat import is_platform_linux +from pandas.compat import ( + ISMUSL, + is_platform_linux, +) from pandas import Timestamp import pandas._testing as tm @@ -224,7 +227,7 @@ def test_timestamp_repr_strftime_small_year_date(self): # Make sure we have zero-padding, consistent with python strftime doc # Note: current behaviour of strftime with %Y is OS-dependent - if is_platform_linux(): + if is_platform_linux() and not ISMUSL: assert stamp.strftime("%Y") == "2", f"Actual: {stamp.strftime('%Y')}" else: assert stamp.strftime("%Y") == "0002", f"Actual: {stamp.strftime('%Y')}" From 514e7858ddf9ecd461dbfdd2e7c68be790d146a8 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 30 Jun 2024 22:21:14 +0200 Subject: [PATCH 110/115] Simplified `convert_strftime_format` as per code review --- pandas/_libs/tslibs/strftime.py | 55 +++++++++---------- pandas/tests/scalar/timestamp/test_formats.py | 1 - pandas/tests/tslibs/test_strftime.py | 18 ++++-- 3 files changed, 40 insertions(+), 34 deletions(-) diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 99edf81505cc8..68faf93998068 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -225,54 +225,53 @@ def convert_strftime_format( if key in strftime_fmt: raise UnsupportedStrFmtDirective(f"Unsupported directive: '{key}'") - # Mapping between strftime and string formatting, according to both styles - if new_style_fmt: - esc = "/_+\\" + # Find an escape sequence, that we will use to replace all '%' signs + esc = _create_escape_sequence(strftime_fmt, init_esc="-+", prefix="-") - # Escape the %% before searching for directives, same as strftime - strftime_fmt = strftime_fmt.replace("%%", esc) + # Escape the %% before searching for directives (we will put them back at the end) + strftime_fmt = strftime_fmt.replace("%%", esc * 2) - esc_l = "+^_\\" - esc_r = "/_^+" + # Mapping between strftime and string formatting, according to both styles + if new_style_fmt: + # Escape single curly braces + strftime_fmt = strftime_fmt.replace("{", "{{").replace("}", "}}") # Create the output by replacing all directives for _map in directive_maps: for key, (_name, _fmt) in _map.items(): - # for example replace "%d" by "{day:02d}" but with escaped { } - strftime_fmt = strftime_fmt.replace( - key, f"{esc_l}{_name}:{_fmt}{esc_r}" - ) + # for example replace "%d" by "{day:02d}" + strftime_fmt = strftime_fmt.replace(key, f"{{{_name}:{_fmt}}}") # If there are remaining percent signs, be conservative and fallback if "%" in strftime_fmt: raise UnsupportedStrFmtDirective("Unsupported directive found") - # Restore the %% into % - strftime_fmt = strftime_fmt.replace(esc, "%") - - # Escape remaining curly braces - strftime_fmt = strftime_fmt.replace("{", "{{").replace("}", "}}") - - # Finally replace our placeholders - strftime_fmt = strftime_fmt.replace(esc_l, "{").replace(esc_r, "}") - else: - esc = "/_^+" - - # Escape the %% before searching for directives, same as strftime - strftime_fmt = strftime_fmt.replace("%%", esc * 2) - # Create the output by replacing all directives for _map in directive_maps: for key, (_name, _fmt) in _map.items(): # for example replace "%d" by "%(day)02d" but with escaped % strftime_fmt = strftime_fmt.replace(key, f"{esc}({_name}){_fmt}") - # If there are remaining percent signs, be conservative and fallback + # If there are remaining percent signs, raise 'unsupported directive' so that + # the caller can fallback to OS C strftime engine. if "%" in strftime_fmt: raise UnsupportedStrFmtDirective("Unsupported directive found") - # Finally replace our placeholder - strftime_fmt = strftime_fmt.replace(esc, "%") + # Restore the escaped %% + strftime_fmt = strftime_fmt.replace(esc, "%") return strftime_fmt, get_current_locale_specific_string() + + +def _create_escape_sequence(txt: str, init_esc: str = "+", prefix: str = "-") -> str: + """Return a unique string that does not exist in txt, by prepending as many + `prefix` as necessary to the initial proposal `init_esc`.""" + + if init_esc in prefix: + raise ValueError("`ini_esc` must not be a subset of `prefix`") + + # Prepend `ini_esc` with `str_to_add` as many times as necessary + while init_esc in txt: + init_esc = prefix + init_esc + return init_esc diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index dcd21319b7a33..0ba6c9e5678be 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -9,7 +9,6 @@ import dateutil.tz import pytest - from pytz.exceptions import NonExistentTimeError from pandas.compat import ( diff --git a/pandas/tests/tslibs/test_strftime.py b/pandas/tests/tslibs/test_strftime.py index 1d417a288fa91..ffdbff678247e 100644 --- a/pandas/tests/tslibs/test_strftime.py +++ b/pandas/tests/tslibs/test_strftime.py @@ -10,6 +10,7 @@ from pandas._libs.tslibs.strftime import ( UnsupportedStrFmtDirective, + _create_escape_sequence, get_current_locale_specific_string, ) @@ -104,11 +105,11 @@ def test_format_datetime(self, strftime_fmt, res_fmt_old, res_fmt_new): "(ms={ms:03d} us={us:06d} ns={ns:09d})", ), ( - "20%y-%m-%d__foo__%I:%M:%S%p", - "20%(shortyear)02d-%(month)02d-%(day)02d__foo__" - "%(hour12)02d:%(min)02d:%(sec)02d%(ampm)s", - "20{shortyear:02d}-{month:02d}-{day:02d}__foo__" - "{hour12:02d}:{min:02d}:{sec:02d}{ampm:s}", + "20%y-%m-%d__f{o}o__%I:%M:%S%%%p", + "20%(shortyear)02d-%(month)02d-%(day)02d__f{o}o__" + "%(hour12)02d:%(min)02d:%(sec)02d%%%(ampm)s", + "20{shortyear:02d}-{month:02d}-{day:02d}__f{{o}}o__" + "{hour12:02d}:{min:02d}:{sec:02d}%%{ampm:s}", ), ), ) @@ -177,3 +178,10 @@ def test_unknown_directive(self): with pytest.raises(ValueError, match="Unsupported directive"): convert_strftime_format("%O", target="datetime", new_style_fmt=True) + + +def test_create_escape_sequence(): + txt = "-*" + esc = _create_escape_sequence(txt, init_esc="*", prefix="-") + assert esc not in txt + assert esc == "--*" From d130c225a40cc764dc53216146b8355afba19e24 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 30 Jun 2024 22:32:34 +0200 Subject: [PATCH 111/115] Renamed `fast_strftime` with `strftime_pystr` as per code review --- pandas/_libs/tslib.pyi | 2 +- pandas/_libs/tslib.pyx | 16 ++++++++-------- pandas/_libs/tslibs/period.pyi | 2 +- pandas/_libs/tslibs/period.pyx | 12 ++++++------ pandas/_libs/tslibs/strftime.py | 4 ++-- pandas/_libs/tslibs/timestamps.pyi | 2 +- pandas/_libs/tslibs/timestamps.pyx | 4 ++-- pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/period.py | 2 +- pandas/io/formats/format.py | 2 +- pandas/tests/scalar/period/test_period.py | 6 +++--- pandas/tests/scalar/timestamp/test_formats.py | 18 +++++++++--------- 12 files changed, 36 insertions(+), 36 deletions(-) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 8318cc4df3120..c9de732ab614f 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -10,7 +10,7 @@ def format_array_from_datetime( format: str | None = ..., na_rep: str | float = ..., reso: int = ..., # NPY_DATETIMEUNIT - fast_strftime: bool = ..., + strftime_pystr: bool = ..., ) -> npt.NDArray[np.object_]: ... def first_non_null(values: np.ndarray) -> int: ... def array_to_datetime( diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index b3488daeb5886..d05d07b0dbf71 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -122,7 +122,7 @@ def format_array_from_datetime( str format=None, na_rep: str | float = "NaT", NPY_DATETIMEUNIT reso=NPY_FR_ns, - fast_strftime=True, + strftime_pystr=True, ) -> np.ndarray: """ return a np object array of the string formatted values @@ -136,7 +136,7 @@ def format_array_from_datetime( na_rep : optional, default is None a nat format reso : NPY_DATETIMEUNIT, default NPY_FR_ns - fast_strftime : bool, default True + strftime_pystr : bool, default True If `True` (default) and the format permits it, a faster formatting method will be used. See `convert_strftime_format`. @@ -188,11 +188,11 @@ def format_array_from_datetime( # Sanity check - these flags are exclusive assert not (basic_format_day and basic_format) - if not basic_format_day and not basic_format and fast_strftime: - # Preprocessing for fast_strftime + if not basic_format_day and not basic_format and strftime_pystr: + # Preprocessing for strftime_pystr if format is None: # We'll fallback to the Timestamp.str method - fast_strftime = False + strftime_pystr = False else: try: # Try to get the string formatting template for this format @@ -201,7 +201,7 @@ def format_array_from_datetime( ) except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` - fast_strftime = False + strftime_pystr = False for i in range(N): # Analogous to: utc_val = values[i] @@ -228,7 +228,7 @@ def format_array_from_datetime( elif show_ms: res += f".{dts.us // 1000:03d}" - elif fast_strftime: + elif strftime_pystr: if tz is None: pandas_datetime_to_datetimestruct(val, reso, &dts) @@ -256,7 +256,7 @@ def format_array_from_datetime( ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz) # Use string formatting for faster strftime - res = ts._fast_strftime(str_format, locale_dt_strings) + res = ts._strftime_pystr(str_format, locale_dt_strings) else: ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz) diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 740e8d9779985..fca379650b5e3 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -48,7 +48,7 @@ def period_array_strftime( dtype_code: int, na_rep, date_format: str | None, - fast_strftime: bool, + strftime_pystr: bool, ) -> npt.NDArray[np.object_]: ... # exposed for tests diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index a7698898f9d44..fe0400c85153d 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1362,7 +1362,7 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt, npy_datetimestruct def period_array_strftime( - ndarray values, int dtype_code, object na_rep, str date_format, bint fast_strftime + ndarray values, int dtype_code, object na_rep, str date_format, bint strftime_pystr ): """ Vectorized Period.strftime used for PeriodArray._format_native_types. @@ -1374,7 +1374,7 @@ def period_array_strftime( Corresponds to PeriodDtype._dtype_code na_rep : any date_format : str or None - fast_strftime : bool + strftime_pystr : bool If `True` and the format permits it, a faster formatting method will be used. See `convert_strftime_format`. """ @@ -1400,7 +1400,7 @@ def period_array_strftime( # None or bytes already date_fmt_bytes = date_format - if fast_strftime and date_format is not None: + if strftime_pystr and date_format is not None: try: # Try to get the string formatting template for this format fast_fmt, fast_loc = convert_strftime_format(date_format, target="period") @@ -1421,7 +1421,7 @@ def period_array_strftime( # freq = frequency_corresponding_to_dtype_code(dtype_code) # per = Period(ordinal, freq=freq) # if fast_fmt: - # item_repr = per._fast_strftime(fast_fmt, fast_loc) + # item_repr = per._strftime_pystr(fast_fmt, fast_loc) # elif date_format: # item_repr = per.strftime(date_format) # else: @@ -2596,7 +2596,7 @@ cdef class _Period(PeriodMixin): object_state = None, self.freq, self.ordinal return (Period, object_state) - def _fast_strftime(self, fmt_str: str, locale_dt_strings: object) -> str: + def _strftime_pystr(self, fmt_str: str, locale_dt_strings: object) -> str: """A faster alternative to `strftime` using string formatting. `fmt_str` and `locale_dt_strings` should be created using @@ -2612,7 +2612,7 @@ cdef class _Period(PeriodMixin): >>> a.strftime('%F-Q%q') '2006-Q1' >>> fast_fmt, loc_dt_strs = convert_strftime_format('%F-Q%q', target="period") - >>> a._fast_strftime(fast_fmt, loc_dt_strs) + >>> a._strftime_pystr(fast_fmt, loc_dt_strs) '2006-Q1' """ value = self.ordinal diff --git a/pandas/_libs/tslibs/strftime.py b/pandas/_libs/tslibs/strftime.py index 68faf93998068..61ee9e1617a9a 100644 --- a/pandas/_libs/tslibs/strftime.py +++ b/pandas/_libs/tslibs/strftime.py @@ -139,10 +139,10 @@ def convert_strftime_format( This method can be tested on a single instance of - - `Timestamp`, through `Timestamp._fast_strftime`. The result may be compared + - `Timestamp`, through `Timestamp._strftime_pystr`. The result may be compared with `Timestamp.strftime` - - `Period` through `Period._fast_strftime`. The result may be compared + - `Period` through `Period._strftime_pystr`. The result may be compared with `Period.strftime`. On array-like objects, this method is used in several places: diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index b995749fbd9dd..ac612d837c57f 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -107,7 +107,7 @@ class Timestamp(datetime): ) -> datetime: ... @classmethod def fromisoformat(cls, date_string: str) -> Self: ... - def _fast_strftime(self, fmt_str: str, locale_dt_strings: object) -> str: ... + def _strftime_pystr(self, fmt_str: str, locale_dt_strings: object) -> str: ... def strftime(self, format: str) -> str: ... def __format__(self, fmt: str) -> str: ... def toordinal(self) -> int: ... diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index b8fb1a54e2f53..f5cec05239296 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1588,7 +1588,7 @@ class Timestamp(_Timestamp): tz = maybe_get_tz(tz) return cls(datetime.fromtimestamp(ts, tz)) - def _fast_strftime(self, fmt_str: str, locale_dt_strings: object) -> str: + def _strftime_pystr(self, fmt_str: str, locale_dt_strings: object) -> str: """A faster alternative to `strftime` using string formatting. `fmt_str` and `locale_dt_strings` should be created using @@ -1601,7 +1601,7 @@ class Timestamp(_Timestamp): >>> from pandas._libs.tslibs import convert_strftime_format >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') >>> fmt, loc_s = convert_strftime_format('%Y-%m-%dT%H:%M:%S', target="datetime") - >>> ts._fast_strftime(fmt, loc_s) + >>> ts._strftime_pystr(fmt, loc_s) '2020-03-14T15:32:52' """ y = self.year diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8ed638ab7e2d4..6a05e375a0454 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -776,7 +776,7 @@ def _format_native_types( format=date_format, na_rep=na_rep, reso=self._creso, - fast_strftime=True, + strftime_pystr=True, ) # ----------------------------------------------------------------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 4a438a3be6bc9..c9626c1d68dfc 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -773,7 +773,7 @@ def _format_native_types( actually format my specific types """ return libperiod.period_array_strftime( - self.asi8, self.dtype._dtype_code, na_rep, date_format, fast_strftime=True + self.asi8, self.dtype._dtype_code, na_rep, date_format, strftime_pystr=True ) # ------------------------------------------------------------------ diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 586934b823df5..888a3f132a7e7 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1622,7 +1622,7 @@ def _format_datetime64_dateonly( if date_format: if str_date_fmt: # Faster, using string formatting - return x._fast_strftime(str_date_fmt, locale_dt_strings) + return x._strftime_pystr(str_date_fmt, locale_dt_strings) else: # Slower return x.strftime(date_format) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 8ce0babc30a84..b835cb6f6db77 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -810,7 +810,7 @@ def test_strftime(self): ) def test_strftime_locale(self, locale_str): """ - Test that `convert_strftime_format` and `fast_strftime` + Test that `convert_strftime_format` and `strftime_pystr` work well together and rely on runtime locale """ @@ -830,10 +830,10 @@ def test_strftime_locale(self, locale_str): # Period am_per = Period("2018-03-11 01:00", freq="h") assert am_local == am_per.strftime("%p") - assert am_local == am_per._fast_strftime(str_tmp, loc_s) + assert am_local == am_per._strftime_pystr(str_tmp, loc_s) pm_per = Period("2018-03-11 13:00", freq="h") assert pm_local == pm_per.strftime("%p") - assert pm_local == pm_per._fast_strftime(str_tmp, loc_s) + assert pm_local == pm_per._strftime_pystr(str_tmp, loc_s) class TestPeriodProperties: diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 0ba6c9e5678be..13801ba55114b 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -236,7 +236,7 @@ def test_timestamp_repr_strftime_small_year_date(self): # This is not OS-dependent str_tmp, loc_dt_s = convert_strftime_format("%Y", target="datetime") - assert stamp._fast_strftime(str_tmp, loc_dt_s) == "0002" + assert stamp._strftime_pystr(str_tmp, loc_dt_s) == "0002" def test_timestamp_repr_strftime_small_shortyear_date(self): stamp = Timestamp("1902-01-01") @@ -245,7 +245,7 @@ def test_timestamp_repr_strftime_small_shortyear_date(self): # Make sure we have zero-padding, consistent with python strftime doc assert stamp.strftime("%y") == "02", f"actual: {stamp.strftime('%y')}" str_tmp, loc_dt_s = convert_strftime_format("%y", target="datetime") - assert stamp._fast_strftime(str_tmp, loc_dt_s) == "02" + assert stamp._strftime_pystr(str_tmp, loc_dt_s) == "02" def test_timestamp_repr_strftime_negative_date(self): stamp = Timestamp("-0020-01-01") @@ -259,12 +259,12 @@ def test_timestamp_repr_strftime_negative_date(self): stamp.strftime("%y") str_tmp, loc_dt_s = convert_strftime_format("%y", target="datetime") - assert Timestamp("-0020-01-01")._fast_strftime(str_tmp, loc_dt_s) == "-20" - assert Timestamp("-0002-01-01")._fast_strftime(str_tmp, loc_dt_s) == "-2" + assert Timestamp("-0020-01-01")._strftime_pystr(str_tmp, loc_dt_s) == "-20" + assert Timestamp("-0002-01-01")._strftime_pystr(str_tmp, loc_dt_s) == "-2" str_tmp, loc_dt_s = convert_strftime_format("%Y", target="datetime") - assert Timestamp("-2020-01-01")._fast_strftime(str_tmp, loc_dt_s) == "-2020" - assert Timestamp("-0200-01-01")._fast_strftime(str_tmp, loc_dt_s) == "-200" + assert Timestamp("-2020-01-01")._strftime_pystr(str_tmp, loc_dt_s) == "-2020" + assert Timestamp("-0200-01-01")._strftime_pystr(str_tmp, loc_dt_s) == "-200" with pytest.raises(NonExistentTimeError): Timestamp("-0020-01-01", tz="US/Eastern") @@ -281,7 +281,7 @@ def test_timestamp_repr_strftime_negative_date(self): ) def test_strftime_locale(self, locale_str): """ - Test that `convert_strftime_format` and `fast_strftime` + Test that `convert_strftime_format` and `strftime_pystr` work well together and rely on runtime locale """ @@ -302,7 +302,7 @@ def test_strftime_locale(self, locale_str): # Timestamp am_ts = Timestamp(2020, 1, 1, 1) assert am_local == am_ts.strftime("%p") - assert am_local == am_ts._fast_strftime(str_tmp, loc_s) + assert am_local == am_ts._strftime_pystr(str_tmp, loc_s) pm_ts = Timestamp(2020, 1, 1, 13) assert pm_local == pm_ts.strftime("%p") - assert pm_local == pm_ts._fast_strftime(str_tmp, loc_s) + assert pm_local == pm_ts._strftime_pystr(str_tmp, loc_s) From 66b5bbeb9c46e07e69f2b029f7cf4363181b516d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 30 Jun 2024 23:06:40 +0200 Subject: [PATCH 112/115] Fixed test on WASM/Pyodide --- pandas/tests/scalar/timestamp/test_formats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index 13801ba55114b..cfa94a7944348 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -229,7 +229,7 @@ def test_timestamp_repr_strftime_small_year_date(self): # Make sure we have zero-padding, consistent with python strftime doc # Note: current behaviour of strftime with %Y is OS-dependent - if is_platform_linux() and not ISMUSL: + if is_platform_linux() and not ISMUSL and not WASM: assert stamp.strftime("%Y") == "2", f"Actual: {stamp.strftime('%Y')}" else: assert stamp.strftime("%Y") == "0002", f"Actual: {stamp.strftime('%Y')}" From 25ad4fbd3f12223f989ea2e614c896256c04386d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 1 Jul 2024 09:22:03 +0200 Subject: [PATCH 113/115] Renamed internal argument ``strftime_pystr`` into ``_use_pystr_engine`` to be more explicit. --- pandas/_libs/tslib.pyi | 2 +- pandas/_libs/tslib.pyx | 14 +++++++------- pandas/_libs/tslibs/period.pyi | 2 +- pandas/_libs/tslibs/period.pyx | 6 +++--- pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/period.py | 6 +++++- pandas/tests/scalar/period/test_period.py | 2 +- pandas/tests/scalar/timestamp/test_formats.py | 2 +- 8 files changed, 20 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index c9de732ab614f..b4c81629102b3 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -10,7 +10,7 @@ def format_array_from_datetime( format: str | None = ..., na_rep: str | float = ..., reso: int = ..., # NPY_DATETIMEUNIT - strftime_pystr: bool = ..., + _use_pystr_engine: bool = ..., ) -> npt.NDArray[np.object_]: ... def first_non_null(values: np.ndarray) -> int: ... def array_to_datetime( diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d05d07b0dbf71..eef978cad6c10 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -122,7 +122,7 @@ def format_array_from_datetime( str format=None, na_rep: str | float = "NaT", NPY_DATETIMEUNIT reso=NPY_FR_ns, - strftime_pystr=True, + _use_pystr_engine=True, ) -> np.ndarray: """ return a np object array of the string formatted values @@ -136,7 +136,7 @@ def format_array_from_datetime( na_rep : optional, default is None a nat format reso : NPY_DATETIMEUNIT, default NPY_FR_ns - strftime_pystr : bool, default True + _use_pystr_engine : bool, default True If `True` (default) and the format permits it, a faster formatting method will be used. See `convert_strftime_format`. @@ -188,11 +188,11 @@ def format_array_from_datetime( # Sanity check - these flags are exclusive assert not (basic_format_day and basic_format) - if not basic_format_day and not basic_format and strftime_pystr: - # Preprocessing for strftime_pystr + if not basic_format_day and not basic_format and _use_pystr_engine: + # Preprocessing for _use_pystr_engine if format is None: # We'll fallback to the Timestamp.str method - strftime_pystr = False + _use_pystr_engine = False else: try: # Try to get the string formatting template for this format @@ -201,7 +201,7 @@ def format_array_from_datetime( ) except UnsupportedStrFmtDirective: # Unsupported directive: fallback to standard `strftime` - strftime_pystr = False + _use_pystr_engine = False for i in range(N): # Analogous to: utc_val = values[i] @@ -228,7 +228,7 @@ def format_array_from_datetime( elif show_ms: res += f".{dts.us // 1000:03d}" - elif strftime_pystr: + elif _use_pystr_engine: if tz is None: pandas_datetime_to_datetimestruct(val, reso, &dts) diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index fca379650b5e3..1ae7b51141271 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -48,7 +48,7 @@ def period_array_strftime( dtype_code: int, na_rep, date_format: str | None, - strftime_pystr: bool, + _use_pystr_engine: bool, ) -> npt.NDArray[np.object_]: ... # exposed for tests diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index fe0400c85153d..0fb5b4fb7ce74 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1362,7 +1362,7 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt, npy_datetimestruct def period_array_strftime( - ndarray values, int dtype_code, object na_rep, str date_format, bint strftime_pystr + ndarray values, int dtype_code, object na_rep, str date_format, bint _use_pystr_engine ): """ Vectorized Period.strftime used for PeriodArray._format_native_types. @@ -1374,7 +1374,7 @@ def period_array_strftime( Corresponds to PeriodDtype._dtype_code na_rep : any date_format : str or None - strftime_pystr : bool + _use_pystr_engine : bool If `True` and the format permits it, a faster formatting method will be used. See `convert_strftime_format`. """ @@ -1400,7 +1400,7 @@ def period_array_strftime( # None or bytes already date_fmt_bytes = date_format - if strftime_pystr and date_format is not None: + if _use_pystr_engine and date_format is not None: try: # Try to get the string formatting template for this format fast_fmt, fast_loc = convert_strftime_format(date_format, target="period") diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6a05e375a0454..b50fd4bb0570f 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -776,7 +776,7 @@ def _format_native_types( format=date_format, na_rep=na_rep, reso=self._creso, - strftime_pystr=True, + _use_pystr_engine=True, ) # ----------------------------------------------------------------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index c9626c1d68dfc..a6632a067f8bd 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -773,7 +773,11 @@ def _format_native_types( actually format my specific types """ return libperiod.period_array_strftime( - self.asi8, self.dtype._dtype_code, na_rep, date_format, strftime_pystr=True + self.asi8, + self.dtype._dtype_code, + na_rep, + date_format, + _use_pystr_engine=True, ) # ------------------------------------------------------------------ diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index b835cb6f6db77..331f565e33fc0 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -810,7 +810,7 @@ def test_strftime(self): ) def test_strftime_locale(self, locale_str): """ - Test that `convert_strftime_format` and `strftime_pystr` + Test that `convert_strftime_format` and `_strftime_pystr` work well together and rely on runtime locale """ diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index cfa94a7944348..badc1d2002b63 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -281,7 +281,7 @@ def test_timestamp_repr_strftime_negative_date(self): ) def test_strftime_locale(self, locale_str): """ - Test that `convert_strftime_format` and `strftime_pystr` + Test that `convert_strftime_format` and `_strftime_pystr` work well together and rely on runtime locale """ From 50015695cba112702dfc875da0148425ada4351d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 1 Jul 2024 09:22:30 +0200 Subject: [PATCH 114/115] Fixed test on pyodide --- pandas/tests/scalar/timestamp/test_formats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/scalar/timestamp/test_formats.py b/pandas/tests/scalar/timestamp/test_formats.py index badc1d2002b63..194c1000bc232 100644 --- a/pandas/tests/scalar/timestamp/test_formats.py +++ b/pandas/tests/scalar/timestamp/test_formats.py @@ -229,7 +229,7 @@ def test_timestamp_repr_strftime_small_year_date(self): # Make sure we have zero-padding, consistent with python strftime doc # Note: current behaviour of strftime with %Y is OS-dependent - if is_platform_linux() and not ISMUSL and not WASM: + if WASM or (is_platform_linux() and not ISMUSL): assert stamp.strftime("%Y") == "2", f"Actual: {stamp.strftime('%Y')}" else: assert stamp.strftime("%Y") == "0002", f"Actual: {stamp.strftime('%Y')}" From 48a0096a82391eeabb4d2b8d9ed53e56cf95340d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 1 Jul 2024 11:15:08 +0200 Subject: [PATCH 115/115] Fixed cython-lint error --- pandas/_libs/tslibs/period.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 0fb5b4fb7ce74..bb9e66cf4dab8 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1362,7 +1362,11 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt, npy_datetimestruct def period_array_strftime( - ndarray values, int dtype_code, object na_rep, str date_format, bint _use_pystr_engine + ndarray values, + int dtype_code, + object na_rep, + str date_format, + bint _use_pystr_engine, ): """ Vectorized Period.strftime used for PeriodArray._format_native_types.