diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 4b02235ac9925..97d6311c62d5b 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -9,6 +9,7 @@ def format_array_from_datetime( tz: tzinfo | None = ..., format: str | None = ..., na_rep: object = ..., + fast_strftime: bool = ..., ) -> npt.NDArray[np.object_]: ... def array_with_unit_to_datetime( values: np.ndarray, diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 9dfc438319148..13bf36b20ec13 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -60,6 +60,10 @@ from pandas._libs.tslibs.nattype cimport ( ) from pandas._libs.tslibs.timestamps cimport _Timestamp +from pandas._libs.tslibs.strftime import ( + UnsupportedStrFmtDirective, + convert_strftime_format, +) from pandas._libs.tslibs.timestamps import Timestamp # Note: this is the only non-tslibs intra-pandas dependency here @@ -100,7 +104,8 @@ def format_array_from_datetime( ndarray[int64_t] values, tzinfo tz=None, str format=None, - object na_rep=None + object na_rep=None, + fast_strftime=True, ) -> np.ndarray: """ return a np object array of the string formatted values @@ -113,19 +118,23 @@ def format_array_from_datetime( a strftime capable string na_rep : optional, default is None a nat format + fast_strftime : bool, default True + If `True` (default) and the format permits it, a faster formatting + method will be used. See `convert_strftime_format`. 
Returns ------- np.ndarray[object] """ cdef: - int64_t val, ns, N = len(values) + int64_t val, ns, y, h, N = len(values) ndarray[int64_t] consider_values bint show_ms = False, show_us = False, show_ns = False bint basic_format = False ndarray[object] result = np.empty(N, dtype=object) object ts, res npy_datetimestruct dts + object str_format, loc_s if na_rep is None: na_rep = 'NaT' @@ -145,6 +154,28 @@ def format_array_from_datetime( consider_values //= 1000 show_ms = (consider_values % 1000).any() + elif format == "%Y-%m-%d %H:%M:%S": + # Same format as default, but with hardcoded precision (s) + basic_format = True + show_ns = show_us = show_ms = False + + elif format == "%Y-%m-%d %H:%M:%S.%f": + # Same format as default, but with hardcoded precision (us) + basic_format = show_us = True + show_ns = show_ms = False + + elif fast_strftime: + if format is None: + # We'll fallback to the Timestamp.str method + fast_strftime = False + else: + try: + # Try to get the string formatting template for this format + str_format, loc_s = convert_strftime_format(format) + except UnsupportedStrFmtDirective: + # Unsupported directive: fallback to standard `strftime` + fast_strftime = False + for i in range(N): val = values[i] @@ -166,10 +197,36 @@ def format_array_from_datetime( result[i] = res + elif fast_strftime: + + if tz is None: + dt64_to_dtstruct(val, &dts) + + # Use string formatting for faster strftime + y = dts.year + h = dts.hour + result[i] = str_format % { + "year": y, + "shortyear": y % 100, + "month": dts.month, + "day": dts.day, + "hour": dts.hour, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": loc_s.pm if (h // 12) else loc_s.am, + "min": dts.min, + "sec": dts.sec, + "us": dts.us, + } + else: + ts = Timestamp(val, tz=tz) + + # Use string formatting for faster strftime + result[i] = ts.fast_strftime(str_format, loc_s) else: ts = Timestamp(val, tz=tz) if format is None: + # Use datetime.str, that returns ts.isoformat(sep=' ') result[i] = str(ts) else: @@ 
-178,6 +235,7 @@ def format_array_from_datetime( try: result[i] = ts.strftime(format) except ValueError: + # Use datetime.str, that returns ts.isoformat(sep=' ') result[i] = str(ts) return result diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 11de4e60f202d..80d7a9de19735 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -9,6 +9,8 @@ "OutOfBoundsTimedelta", "IncompatibleFrequency", "Period", + "convert_strftime_format", + "UnsupportedStrFmtDirective", "Resolution", "Timedelta", "normalize_i8_timestamps", @@ -48,6 +50,10 @@ IncompatibleFrequency, Period, ) +from pandas._libs.tslibs.strftime import ( + UnsupportedStrFmtDirective, + convert_strftime_format, +) from pandas._libs.tslibs.timedeltas import ( Timedelta, delta_to_nanoseconds, diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index fef98199d3dbc..089d8cb1e2396 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1566,8 +1566,10 @@ cdef class BusinessHour(BusinessMixin): def _repr_attrs(self) -> str: out = super()._repr_attrs() + # Use python string formatting to be faster than strftime + # f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}' hours = ",".join( - f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}' + f'{st.hour:02d}:{st.minute:02d}-{en.hour:02d}:{en.minute:02d}' for st, en in zip(self.start, self.end) ) attrs = [f"{self._prefix}={hours}"] diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 5ad919649262c..2b62426bb8d46 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -83,6 +83,7 @@ class Period(PeriodMixin): def _from_ordinal(cls, ordinal: int, freq) -> Period: ... @classmethod def now(cls, freq: BaseOffset = ...) -> Period: ... + def fast_strftime(self, fmt_str: str, loc_s: object) -> str: ... def strftime(self, fmt: str) -> str: ... 
cdef str period_format(int64_t value, int freq, object fmt=None):
    # Missing values render as "NaT" regardless of the requested format.
    if value == NPY_NAT:
        return "NaT"

    # No explicit format: use the per-frequency default layout, which is
    # produced with plain string formatting instead of the C strftime.
    if fmt is None:
        return _period_fast_strftime(value, freq)

    # Explicit format: `_period_strftime` takes `bytes`, so encode str input.
    if isinstance(fmt, str):
        fmt = fmt.encode("utf-8")
    return _period_strftime(value, freq, fmt)


cdef str _period_fast_strftime(int64_t value, int freq):
    """
    Render a period ordinal using the default layout of its frequency group.

    The output is built with f-strings; the equivalent strftime-style format
    is noted next to each branch.

    Raises
    ------
    ValueError
        If the frequency group of `freq` is not one of the handled groups.
    """
    cdef:
        int group, qtr
        npy_datetimestruct dts
        str day_part, minute_part, second_part

    # Decompose the ordinal into calendar/time fields.
    get_date_info(value, freq, &dts)

    group = get_freq_group(freq)

    if group == FR_ANN:
        # '%Y'
        return f"{dts.year}"

    if group == FR_QTR:
        # '%FQ%q' -- get_yq returns the quarter and rewrites dts.year
        # (presumably to the fiscal year -- TODO confirm)
        qtr = get_yq(value, freq, &dts)
        return f"{dts.year}Q{qtr}"

    if group == FR_MTH:
        # '%Y-%m'
        return f"{dts.year}-{dts.month:02d}"

    if group == FR_WK:
        # Weekly periods are shown as "<first day>/<last day>": convert both
        # ends to daily ordinals and format each one recursively.
        week_start = period_asfreq(value, freq, FR_DAY, 0)
        week_end = period_asfreq(value, freq, FR_DAY, 1)
        return f"{period_format(week_start, FR_DAY)}/{period_format(week_end, FR_DAY)}"

    # Every remaining layout starts with the calendar date.
    day_part = f"{dts.year}-{dts.month:02d}-{dts.day:02d}"
    if group == FR_BUS or group == FR_DAY:
        # '%Y-%m-%d'
        return day_part

    if group == FR_HR:
        # '%Y-%m-%d %H:00'
        return f"{day_part} {dts.hour:02d}:00"

    minute_part = f"{day_part} {dts.hour:02d}:{dts.min:02d}"
    if group == FR_MIN:
        # '%Y-%m-%d %H:%M'
        return minute_part

    second_part = f"{minute_part}:{dts.sec:02d}"
    if group == FR_SEC:
        # '%Y-%m-%d %H:%M:%S'
        return second_part

    if group == FR_MS:
        # '%Y-%m-%d %H:%M:%S.%l'
        return f"{second_part}.{(dts.us // 1_000):03d}"

    if group == FR_US:
        # '%Y-%m-%d %H:%M:%S.%u'
        return f"{second_part}.{(dts.us):06d}"

    if group == FR_NS:
        # '%Y-%m-%d %H:%M:%S.%n'
        return f"{second_part}.{((dts.us * 1000) + (dts.ps // 1000)):09d}"

    raise ValueError(f"Unknown freq: {freq}")
def fast_strftime(self, fmt_str: str, loc_s: object) -> str:
    """
    Format this Period with an old-style (``%``) string template.

    This is a faster alternative to `strftime`. Both `fmt_str` and `loc_s`
    are expected to come from
    ``convert_strftime_format(fmt, target="period")``.

    See also `self.strftime`, that relies on `period_format`.

    Examples
    --------

    >>> from pandas._libs.tslibs import convert_strftime_format
    >>> a = Period(freq='Q-JUL', year=2006, quarter=1)
    >>> a.strftime('%F-Q%q')
    '2006-Q1'
    >>> fast_fmt, loc_s = convert_strftime_format('%F-Q%q', target="period")
    >>> a.fast_strftime(fast_fmt, loc_s)
    '2006-Q1'
    """
    cdef:
        npy_datetimestruct fields, fiscal
        int qtr, year, hour

    freq_code = self._dtype._dtype_code
    ordinal = self.ordinal

    # Missing periods have no fields to substitute.
    if ordinal == NPY_NAT:
        return "NaT"

    # Calendar / clock fields of the period
    get_date_info(ordinal, freq_code, &fields)

    # Quarter number; `fiscal.year` is filled in by get_yq
    qtr = get_yq(ordinal, freq_code, &fiscal)

    year = fields.year
    hour = fields.hour

    # One entry per placeholder name that the template may reference
    substitutions = {
        "year": year,
        "shortyear": year % 100,
        "month": fields.month,
        "day": fields.day,
        "hour": hour,
        # 12-hour clock: both midnight and noon are shown as 12
        "hour12": 12 if hour in (0, 12) else (hour % 12),
        "ampm": loc_s.pm if (hour // 12) else loc_s.am,
        "min": fields.min,
        "sec": fields.sec,
        "ms": fields.us // 1000,
        "us": fields.us,
        "ns": (fields.us * 1000) + (fields.ps // 1000),
        "q": qtr,
        "Fyear": fiscal.year,
        "fyear": fiscal.year % 100,
    }
    return fmt_str % substitutions
from datetime import time
import locale
import re
from typing import (
    Dict,
    Tuple,
)


class UnsupportedStrFmtDirective(ValueError):
    """The format contains a directive that is not supported in this context."""


# Directives that are known to have no string-template equivalent here.
# This tuple is informational: `convert_strftime_format` rejects *every*
# directive that is absent from the maps below, listed here or not.
_COMMON_UNSUPPORTED = (
    # 1- Names not in the numpy or datetime attr representation
    "%a",  # Weekday as locale's abbreviated name.
    "%A",  # Weekday as locale's full name.
    "%w",  # Weekday as a decimal number, where 0 is Sunday and 6 is Saturday.
    "%b",  # Month as locale's abbreviated name.
    "%B",  # Month as locale's full name.
    # 2- TODO Below Time offset and timezone information ... but may be hard
    "%z",  # UTC offset in the form +-HHMM[SS[.ffffff]] ("" if tz naive).
    "%Z",  # Time zone name ("" if tz naive).
    # 3- Probably too complex ones for now
    "%j",  # Day of the year as a zero-padded decimal number.
    "%U",  # Week number of the year (Sunday as the first day of the week) as
    # a zero-padded decimal number. All days in a new year preceding the first
    # Sunday are considered to be in week 0.
    "%W",  # Week number of the year (Monday as the first day of the week) as
    # a zero-padded decimal number. All days in a new year preceding the first
    # Monday are considered to be in week 0.
    "%c",  # Locale's appropriate date and time representation.
    "%x",  # Locale's appropriate date representation.
    "%X",  # Locale's appropriate time representation.
)


# directive -> (template field name, format spec)
_COMMON_MAP = {
    "%d": ("day", "02d"),  # Day of the month as a zero-padded decimal number.
    "%m": ("month", "02d"),  # Month as a zero-padded decimal number.
    "%Y": ("year", "d"),  # Year with century as a decimal number.
    "%y": ("shortyear", "02d"),  # Year without century as 0-padded decimal nb.
    "%H": ("hour", "02d"),  # Hour (24-hour clock) as 0-padded decimal number.
    "%I": ("hour12", "02d"),  # Hour (12-hour clock) as a 0-padded decimal nb.
    "%p": ("ampm", "s"),  # Locale's equivalent of either AM or PM.
    "%M": ("min", "02d"),  # Minute as a zero-padded decimal number.
    "%S": ("sec", "02d"),  # Second as a zero-padded decimal number.
}

_DATETIME_MAP = {
    "%f": ("us", "06d"),  # Microsecond as decimal number, 0-padded to 6 digits
}

_PERIOD_MAP = {
    # 'Fiscal' year without century as zero-padded decimal number [00,99]
    "%f": ("fyear", "02d"),
    "%F": ("Fyear", "d"),  # 'Fiscal' year with century as a decimal number
    "%q": ("q", "d"),  # Quarter as a decimal number [1,4]
    "%l": ("ms", "03d"),  # Millisecond as decimal number, 0-padded 3 digits
    "%u": ("us", "06d"),  # Microsecond as decimal number, 0-padded 6 digits
    "%n": ("ns", "09d"),  # Nanosecond as decimal number, 0-padded 9 digits
}

# Complete directive tables per kind of target, merged once at import time.
_DATETIME_DIRECTIVES = {**_COMMON_MAP, **_DATETIME_MAP}
_PERIOD_DIRECTIVES = {**_COMMON_MAP, **_PERIOD_MAP}

# A format string is a sequence of tokens, each being either "%" followed by
# (at most) one character, or a run of literal text without any "%".
_STRFTIME_TOKEN = re.compile(r"%.?|(?P<text>[^%]+)", re.DOTALL)


class LocaleSpecificDtStrings:
    """A container for date/time strings used in a specific locale.

    We will use these when formatting datetime as string using string templates, which
    is faster than strftime when executed on arrays.

    `get_current_locale_specific_string()` is the recommended way to get an instance,
    as it provides caching.

    Attributes
    ----------
    am : str
        Used in the %p strftime directive. Locale's equivalent of AM.
    pm : str
        Used in the %p strftime directive. Locale's equivalent of PM.
    """

    __slots__ = ("am", "pm")

    def __init__(self, am: str, pm: str):
        self.am = am
        self.pm = pm

    def __repr__(self):
        attrs = ", ".join(
            [f"{k}={repr(getattr(self, k))}" for k in type(self).__slots__]
        )
        return f"{type(self).__name__}({attrs})"

    @classmethod
    def get_current(cls):
        """Build an instance from what ``%p`` yields under the current locale."""
        return cls(
            am=time(1).strftime("%p"),
            pm=time(13).strftime("%p"),
        )


# Cache of locale-specific strings, keyed by the locale setting string.
_locale_specifics: Dict[str, LocaleSpecificDtStrings] = {}


def get_current_locale_specific_string() -> LocaleSpecificDtStrings:
    """Return a `LocaleSpecificDtStrings` for the current locale.

    This function caches results in the `_locale_specifics` dict.
    """
    # Calling setlocale without a new value only queries the current setting
    current_locale = locale.setlocale(locale.LC_ALL)

    try:
        # Any entry in cache for current locale ?
        return _locale_specifics[current_locale]
    except KeyError:
        # Create it using current locale, and cache it
        loc_s = LocaleSpecificDtStrings.get_current()
        _locale_specifics[current_locale] = loc_s
        return loc_s


def convert_strftime_format(
    strftime_fmt: str,
    target: str = "datetime",
    new_style_fmt: bool = False,
) -> Tuple[str, LocaleSpecificDtStrings]:
    """Convert a strftime formatting string into a formatting template string.

    The format is scanned once, left to right. ``%%`` becomes a literal
    percent sign, each supported directive becomes a named template field,
    and literal text is escaped as required by the chosen template style.
    The set of supported directives varies according to the `target`.

    Parameters
    ----------
    strftime_fmt : str
        The strftime format string specification, e.g. `"%Y-%m-%d %H:%M:%S"`.
    target : { "datetime", "date", "time", "period" }, default: "datetime"
        The kind of data that will be formatted using this template.
        "datetime", "date" and "time" share the same directive table.
    new_style_fmt : bool, default: False
        Whether the output string should be new-style
        e.g. "{year:d}-{month:02d}-{day:02d} {hour:02d}:{min:02d}:{sec:02d}"
        or old-style
        e.g. "%(year)d-%(month)02d-%(day)02d %(hour)02d:%(min)02d:%(sec)02d"

    Returns
    -------
    fmt_out : str
        A template whose style is old-style or new-style depending on
        `new_style_fmt`.
        For old-style, it may be used as `fmt_out % fmt_dct`.
        For new-style, it may be used as `fmt_out.format(**fmt_dct)`
    loc_s : LocaleSpecificDtStrings
        An object containing the locale-specific strings needed for some of the
        directives. For example loc_s.am and loc_s.pm should be used to fill the
        "ampm" part of the template, induced by directive %p.

    Raises
    ------
    ValueError
        If `target` is not one of the accepted values.
    UnsupportedStrFmtDirective
        If `strftime_fmt` contains a directive with no template equivalent
        for this `target`. This covers every directive that is absent from
        the directive tables (for instance ``%a``, ``%Z``, ``%G``, ``%-d``)
        as well as a dangling ``%`` at the end of the format. Callers are
        expected to fall back to the regular ``strftime`` in that case.

    Notes
    -----
    Period targets accept the additional directives ``%f``, ``%F``, ``%q``,
    ``%l``, ``%u`` and ``%n``; note that ``%f`` means "fiscal year without
    century" for periods but "microsecond" for the other targets.

    The strftime format codes are described in the Python documentation:
    https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
    """
    if target in ("datetime", "date", "time"):
        directives = _DATETIME_DIRECTIVES
    elif target == "period":
        directives = _PERIOD_DIRECTIVES
    else:
        raise ValueError(f"Invalid target: {repr(target)}")

    def _convert_token(match):
        literal = match.group("text")
        if literal is not None:
            # Literal text never contains "%" (see the token regex), so only
            # new-style templates need escaping, for their curly braces.
            if new_style_fmt:
                return literal.replace("{", "{{").replace("}", "}}")
            return literal

        directive = match.group(0)
        if directive == "%%":
            # An escaped percent sign must stay escaped in old-style templates
            return "%" if new_style_fmt else "%%"

        try:
            name, spec = directives[directive]
        except KeyError:
            # Never emit an unknown directive as literal text: that would
            # silently diverge from what strftime outputs.
            raise UnsupportedStrFmtDirective(
                f"Unsupported directive: '{directive}'"
            ) from None

        if new_style_fmt:
            # for example replace "%d" by "{day:02d}"
            return f"{{{name}:{spec}}}"
        # for example replace "%d" by "%(day)02d"
        return f"%({name}){spec}"

    fmt_out = _STRFTIME_TOKEN.sub(_convert_token, strftime_fmt)
    return fmt_out, get_current_locale_specific_string()
def fast_strftime(self, fmt_str: str, loc_s: object) -> str:
    """
    Format this timestamp with an old-style (``%``) string template.

    This is a faster alternative to `strftime`. Both `fmt_str` and `loc_s`
    are expected to come from ``convert_strftime_format(fmt)``.

    See also `self.strftime`, that relies on `datetime.strftime`.

    Examples
    --------
    >>> from pandas._libs.tslibs import convert_strftime_format
    >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
    >>> fmt, loc_s = convert_strftime_format('%Y-%m-%dT%H:%M:%S')
    >>> ts.fast_strftime(fmt, loc_s)
    '2020-03-14T15:32:52'
    """
    year = self.year
    hour = self.hour

    # Split the hour into its half-day index and its position in that half
    half_day, hour_in_half = divmod(hour, 12)

    # One entry per placeholder name that the template may reference
    substitutions = {
        "year": year,
        "shortyear": year % 100,
        "month": self.month,
        "day": self.day,
        "hour": hour,
        # 12-hour clock: both midnight and noon are shown as 12
        "hour12": 12 if hour in (0, 12) else hour_in_half,
        "ampm": loc_s.pm if half_day else loc_s.am,
        "min": self.minute,
        "sec": self.second,
        "us": self.microsecond,
    }
    return fmt_str % substitutions
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8f0516abe8bb3..fc1939be20393 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -295,7 +295,9 @@ def asi8(self) -> npt.NDArray[np.int64]: # ---------------------------------------------------------------- # Rendering Methods - def _format_native_types(self, *, na_rep="NaT", date_format=None): + def _format_native_types( + self, *, na_rep="NaT", date_format=None, fast_strftime=True + ): """ Helper method for astype when converting to strings. @@ -1584,7 +1586,9 @@ class DatelikeOps(DatetimeLikeArrayMixin): URL="https://docs.python.org/3/library/datetime.html" "#strftime-and-strptime-behavior" ) - def strftime(self, date_format: str) -> npt.NDArray[np.object_]: + def strftime( + self, date_format: str, fast_strftime: bool = True + ) -> npt.NDArray[np.object_]: """ Convert to Index using specified date_format. @@ -1606,6 +1610,12 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: date_format : str Date format string (e.g. "%%Y-%%m-%%d"). + fast_strftime : bool, default True + If `True` (default) and the format permits it, a faster formatting + method will be used. See `convert_strftime_format`. + + .. 
versionadded:: 1.5.0 + Returns ------- ndarray[object] @@ -1629,7 +1639,9 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: 'March 10, 2018, 09:00:02 AM'], dtype='object') """ - result = self._format_native_types(date_format=date_format, na_rep=np.nan) + result = self._format_native_types( + date_format=date_format, na_rep=np.nan, fast_strftime=fast_strftime + ) return result.astype(object, copy=False) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 9ffe33e0cf38e..ae290a034404e 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -680,14 +680,18 @@ def astype(self, dtype, copy: bool = True): @dtl.ravel_compat def _format_native_types( - self, *, na_rep="NaT", date_format=None, **kwargs + self, *, na_rep="NaT", date_format=None, fast_strftime=True, **kwargs ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_datetime64_from_values fmt = get_format_datetime64_from_values(self, date_format) return tslib.format_array_from_datetime( - self.asi8, tz=self.tz, format=fmt, na_rep=na_rep + self.asi8, + tz=self.tz, + format=fmt, + na_rep=na_rep, + fast_strftime=fast_strftime, ) # ----------------------------------------------------------------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index fa543f6773634..2879011d2a75b 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -19,6 +19,8 @@ NaT, NaTType, Timedelta, + UnsupportedStrFmtDirective, + convert_strftime_format, delta_to_nanoseconds, dt64arr_to_periodarr as c_dt64arr_to_periodarr, iNaT, @@ -634,18 +636,39 @@ def _formatter(self, boxed: bool = False): @dtl.ravel_compat def _format_native_types( - self, *, na_rep="NaT", date_format=None, **kwargs + self, *, na_rep="NaT", date_format=None, fast_strftime=True, **kwargs ) -> np.ndarray: """ actually format my specific types + + TODO maybe rather align with the way it is done in datetimes.py ? 
+ (delegate all to a tslib.format_array_from_period cython numpy method) """ values = self.astype(object) + # Create the formatter function if date_format: - formatter = lambda per: per.strftime(date_format) + if fast_strftime: + try: + # Try to get the string formatting template for this format + str_format, loc_s = convert_strftime_format( + date_format, target="period" + ) + except UnsupportedStrFmtDirective: + # Unsupported directive: fallback to standard `strftime` + fast_strftime = False + + if fast_strftime: + # Faster: python old-style string formatting + formatter = lambda per: per.fast_strftime(str_format, loc_s) + else: + # Slower: strftime + formatter = lambda per: per.strftime(date_format) else: + # Uses `_Period.str>format_period` that is faster than strftime too formatter = lambda per: str(per) + # Apply the formatter to all values in the array, possibly with a mask if self._hasna: mask = self._isnan values[mask] = na_rep diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index dc63cd92bbb2b..ccc8793aa08fe 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -430,10 +430,12 @@ def _formatter(self, boxed: bool = False): @dtl.ravel_compat def _format_native_types( - self, *, na_rep="NaT", date_format=None, **kwargs + self, *, na_rep="NaT", date_format=None, fast_strftime=True, **kwargs ) -> np.ndarray: from pandas.io.formats.format import get_format_timedelta64 + # Note: TimeDelta._repr_base already uses `fast_strftime` built-in + # (and does not take the `date_format` arg into account) formatter = get_format_timedelta64(self._ndarray, na_rep) return np.array([formatter(x) for x in self._ndarray]) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 98e0ab43f2a09..d7b74a9d3d562 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3411,6 +3411,7 @@ def to_csv( lineterminator: str | None = None, chunksize: int | None = None, date_format: str | None = None, + 
fast_strftime: bool_t = True, doublequote: bool_t = True, escapechar: str | None = None, decimal: str = ".", @@ -3501,6 +3502,12 @@ def to_csv( Rows to write at a time. date_format : str, default None Format string for datetime objects. + fast_strftime : bool, default True + If `True` (default) and the format permits it, a faster formatting + method will be used. See `convert_strftime_format`. + + .. versionadded:: 1.5.0 + doublequote : bool, default True Control quoting of `quotechar` inside a field. escapechar : str, default None @@ -3583,6 +3590,7 @@ def to_csv( chunksize=chunksize, quotechar=quotechar, date_format=date_format, + fast_strftime=fast_strftime, doublequote=doublequote, escapechar=escapechar, storage_options=storage_options, diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 25b7a5c3d3689..435f685975c45 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -156,6 +156,7 @@ def format( formatter: Callable | None = None, na_rep: str = "NaT", date_format: str | None = None, + fast_strftime: bool = True, ) -> list[str]: """ Render a string representation of the Index. 
@@ -171,14 +172,22 @@ def format( if formatter is not None: return header + list(self.map(formatter)) - return self._format_with_header(header, na_rep=na_rep, date_format=date_format) + return self._format_with_header( + header, na_rep=na_rep, date_format=date_format, fast_strftime=fast_strftime + ) def _format_with_header( - self, header: list[str], na_rep: str = "NaT", date_format: str | None = None + self, + header: list[str], + na_rep: str = "NaT", + date_format: str | None = None, + fast_strftime: bool = True, ) -> list[str]: # matches base class except for whitespace padding and date_format return header + list( - self._format_native_types(na_rep=na_rep, date_format=date_format) + self._format_native_types( + na_rep=na_rep, date_format=date_format, fast_strftime=fast_strftime + ) ) @property diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 3954cb28c2aca..50eecd0c15ac9 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -260,8 +260,8 @@ class DatetimeIndex(DatetimeTimedeltaMixin): # methods that dispatch to DatetimeArray and wrap result @doc(DatetimeArray.strftime) - def strftime(self, date_format) -> Index: - arr = self._data.strftime(date_format) + def strftime(self, date_format, fast_strftime: bool = True) -> Index: + arr = self._data.strftime(date_format, fast_strftime=fast_strftime) return Index(arr, name=self.name) @doc(DatetimeArray.tz_convert) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 46fd1cad97440..a3ccdebec8dfb 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -25,6 +25,7 @@ Timedelta, Timestamp, conversion, + convert_strftime_format, iNaT, nat_strings, parsing, @@ -1308,3 +1309,67 @@ def to_time(arg, format=None, infer_time_format=False, errors="raise"): from pandas.core.tools.times import to_time return to_time(arg, format, infer_time_format, errors) + + +def fast_strftime( + dt: datetime, + fmt: str, 
+ new_style_fmt: bool = False, +) -> str: + """A faster version of `datetime.strftime` using python string formatting. + + Returns a string representing the date and time, controlled by an explicit + format string using named attributes of `dt`. + For a complete list of formatting directives, see + `strftime() and strptime() Behavior `_. + + Parameters + ---------- + dt : datetime + The `datetime` instance to convert. + fmt : str + The format string + new_style_fmt : bool, default: False + A boolean indicating if old- or new-style python string formatting should be used. + By default the old-style formatting is enabled, as it is faster as of python <= 3.9. + + Returns + ------- + out_str : str + A string representing the datetime using the format specified. + + Raises + ------ + UnsupportedStrFmtDirective + Raised when the received `strftime_fmt` format contains a directive for + which the output can not currently be created using string formatting. + + See Also + -------- + - `strftime format codes reference `_ # noqa + - `Stackoverflow post `_ + explaining how old-style formatting is faster than new-style formatting, + itself faster than `datetime.strftime`. 
+ """ + # common dict used for formatting + y = dt.year + h = dt.hour + + # get the formatting template + fmt_str, loc_s = convert_strftime_format(fmt, new_style_fmt=new_style_fmt) + + fmt_dct = { + "year": y, + "shortyear": y % 100, + "month": dt.month, + "day": dt.day, + "hour": h, + "hour12": 12 if h in (0, 12) else (h % 12), + "ampm": loc_s.pm if (h // 12) else loc_s.am, + "min": dt.minute, + "sec": dt.second, + "us": dt.microsecond, + } + + # execute + return fmt_str.format(**fmt_dct) if new_style_fmt else (fmt_str % fmt_dct) diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index f5367df6f228d..f59ddc7ad832b 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -196,7 +196,8 @@ def _make_table_cell(self, cell) -> tuple[object, Any]: TableCell(valuetype="date", datevalue=value, attributes=attributes), ) elif isinstance(val, datetime.date): - value = val.strftime("%Y-%m-%d") + # value = val.strftime("%Y-%m-%d") (slower) + value = f"{val.year}-{val.month:02d}-{val.day:02d}" # faster pvalue = val.strftime("%x") return ( pvalue, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 3fd2a5e2bca32..fdd82ffb94260 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -29,12 +29,9 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.generic import ( - ABCDatetimeIndex, ABCIndex, ABCMultiIndex, - ABCPeriodIndex, ) -from pandas.core.dtypes.missing import notna from pandas.core.indexes.api import Index @@ -63,6 +60,7 @@ def __init__( chunksize: int | None = None, quotechar: str | None = '"', date_format: str | None = None, + fast_strftime: bool = True, doublequote: bool = True, escapechar: str | None = None, storage_options: StorageOptions = None, @@ -86,6 +84,7 @@ def __init__( self.escapechar = escapechar self.lineterminator = lineterminator or os.linesep self.date_format = date_format + self.fast_strftime = fast_strftime self.cols = 
self._initialize_columns(cols) self.chunksize = self._initialize_chunksize(chunksize) @@ -172,6 +171,7 @@ def _number_format(self) -> dict[str, Any]: "na_rep": self.na_rep, "float_format": self.float_format, "date_format": self.date_format, + "fast_strftime": self.fast_strftime, "quoting": self.quoting, "decimal": self.decimal, } @@ -179,14 +179,7 @@ def _number_format(self) -> dict[str, Any]: @cache_readonly def data_index(self) -> Index: data_index = self.obj.index - if ( - isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex)) - and self.date_format is not None - ): - data_index = Index( - [x.strftime(self.date_format) if notna(x) else "" for x in data_index] - ) - elif isinstance(data_index, ABCMultiIndex): + if isinstance(data_index, ABCMultiIndex): data_index = data_index.remove_unused_levels() return data_index diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index ea938c924ae0c..67c1d8156f19c 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -42,6 +42,8 @@ NaT, Timedelta, Timestamp, + UnsupportedStrFmtDirective, + convert_strftime_format, iNaT, ) from pandas._libs.tslibs.nattype import NaTType @@ -1144,6 +1146,7 @@ def to_csv( lineterminator: str | None = None, chunksize: int | None = None, date_format: str | None = None, + fast_strftime: bool = True, doublequote: bool = True, escapechar: str | None = None, errors: str = "strict", @@ -1174,6 +1177,7 @@ def to_csv( chunksize=chunksize, quotechar=quotechar, date_format=date_format, + fast_strftime=fast_strftime, doublequote=doublequote, escapechar=escapechar, storage_options=storage_options, @@ -1613,11 +1617,13 @@ def __init__( values: np.ndarray | Series | DatetimeIndex | DatetimeArray, nat_rep: str = "NaT", date_format: None = None, + fast_strftime: bool = True, **kwargs, ) -> None: super().__init__(values, **kwargs) self.nat_rep = nat_rep self.date_format = date_format + self.fast_strftime = fast_strftime def _format_strings(self) -> list[str]: """we 
by definition have DO NOT have a TZ""" @@ -1630,7 +1636,9 @@ def _format_strings(self) -> list[str]: return [self.formatter(x) for x in values] fmt_values = values._data._format_native_types( - na_rep=self.nat_rep, date_format=self.date_format + na_rep=self.nat_rep, + date_format=self.date_format, + fast_strftime=self.fast_strftime, ) return fmt_values.tolist() @@ -1759,6 +1767,8 @@ def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str: if x is NaT: return nat_rep + # Note: this seems to rely on datetime.datetime.__str__ = isoformat + # so it already uses string templating rather than strftime (faster). return str(x) @@ -1766,25 +1776,56 @@ def _format_datetime64_dateonly( x: NaTType | Timestamp, nat_rep: str = "NaT", date_format: str | None = None, + str_date_fmt: str | None = None, + loc_s: object | None = None, ) -> str: if isinstance(x, NaTType): return nat_rep if date_format: - return x.strftime(date_format) + if str_date_fmt: + # Faster, using string formatting + return x.fast_strftime(str_date_fmt, loc_s) + else: + # Slower + return x.strftime(date_format) else: + # error: Item "NaTType" of "Union[NaTType, Any]" has no attribute "_date_repr" + # The underlying problem here is that mypy doesn't understand that NaT + # is a singleton, so that the check above excludes it here. 
+ # + # Note: this relies on string templating (faster than strftime) return x._date_repr def get_format_datetime64( - is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None + is_dates_only: bool, + nat_rep: str = "NaT", + date_format: str | None = None, + fast_strftime: bool = True, ) -> Callable: + """Return a formatter callable taking a datetime64 as input and providing + a string as output""" if is_dates_only: + str_date_fmt = loc_s = None + if date_format is not None and fast_strftime: + try: + # Try to get the string formatting template for this format + str_date_fmt, loc_s = convert_strftime_format(date_format) + except UnsupportedStrFmtDirective: + # Unsupported directive: fallback to standard `strftime` + pass + return lambda x: _format_datetime64_dateonly( - x, nat_rep=nat_rep, date_format=date_format + x, + nat_rep=nat_rep, + date_format=date_format, + str_date_fmt=str_date_fmt, + loc_s=loc_s, ) else: + # Relies on datetime.str, which is fast already return lambda x: _format_datetime64(x, nat_rep=nat_rep) @@ -1799,6 +1840,7 @@ def get_format_datetime64_from_values( ido = is_dates_only(values) if ido: + # only dates and no timezone: provide a default format return date_format or "%Y-%m-%d" return date_format @@ -1807,9 +1849,13 @@ class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self) -> list[str]: """we by definition have a TZ""" values = self.values.astype(object) - ido = is_dates_only(values) + # When there is a timezone `is_dates_only` always returns `False` since dates + # are not universal dates but 00:00:00 timestamps in the given timezone. 
+ assert not is_dates_only(values) formatter = self.formatter or get_format_datetime64( - ido, date_format=self.date_format + is_dates_only=False, + date_format=self.date_format, + fast_strftime=self.fast_strftime, ) fmt_values = [formatter(x) for x in values] @@ -1829,6 +1875,7 @@ def __init__( self.box = box def _format_strings(self) -> list[str]: + # Note: `get_format_timedelta64` uses fast formatting formatter = self.formatter or get_format_timedelta64( self.values, nat_rep=self.nat_rep, box=self.box ) @@ -1872,6 +1919,8 @@ def _formatter(x): if not isinstance(x, Timedelta): x = Timedelta(x) + + # Note: this does not use strftime but string formatting (faster) result = x._repr_base(format=format) if box: result = f"'{result}'" diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 701642ad2cfe2..f2e95b763c7b4 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1836,7 +1836,20 @@ def __init__(self, *args, **kwargs) -> None: # this will transform time(12,34,56,789) into '12:34:56.000789' # (this is what sqlalchemy does) - sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f")) + # sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f")) + def _adapt(dt): + if dt.tzinfo is None: + # This is faster than strftime + return "%02d:%02d:%02d.%06d" % ( + dt.hour, + dt.minute, + dt.second, + dt.microsecond, + ) + else: + return dt.strftime("%H:%M:%S.%f") + + sqlite3.register_adapter(time, _adapt) super().__init__(*args, **kwargs) def sql_schema(self): diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index 3454c8bb90941..b78df1041d399 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -207,6 +207,22 @@ def test_categorical_repr_datetime(self): assert repr(c) == exp + # same with dates only: since there is a timezone, dates become datetimes + idx = date_range("2011-01-01", freq="D", periods=5, tz="US/Eastern") + c = 
Categorical(idx) + exp = ( + "[2011-01-01 00:00:00-05:00, 2011-01-02 00:00:00-05:00, " + "2011-01-03 00:00:00-05:00, 2011-01-04 00:00:00-05:00, " + "2011-01-05 00:00:00-05:00]\n" + "Categories (5, datetime64[ns, US/Eastern]): " + "[2011-01-01 00:00:00-05:00, 2011-01-02 00:00:00-05:00,\n" + " " + "2011-01-03 00:00:00-05:00, 2011-01-04 00:00:00-05:00,\n" + " " + "2011-01-05 00:00:00-05:00]" + ) + assert repr(c) == exp + def test_categorical_repr_datetime_ordered(self): idx = date_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx, ordered=True) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index b7874d51b6f33..8f20806bf85b7 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -993,13 +993,16 @@ def test_to_csv_compression(self, df, encoding, compression): with tm.decompress_file(filename, compression) as fh: tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) - def test_to_csv_date_format(self, datetime_frame): + @pytest.mark.parametrize("fast_strftime", (True, False)) + def test_to_csv_date_format(self, datetime_frame, fast_strftime): with tm.ensure_clean("__tmp_to_csv_date_format__") as path: dt_index = datetime_frame.index datetime_frame = DataFrame( {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index ) - datetime_frame.to_csv(path, date_format="%Y%m%d") + datetime_frame.to_csv( + path, date_format="%Y%m%d", fast_strftime=fast_strftime + ) # Check that the data was put in the specified format test = read_csv(path, index_col=0) @@ -1013,7 +1016,9 @@ def test_to_csv_date_format(self, datetime_frame): tm.assert_frame_equal(test, datetime_frame_int) - datetime_frame.to_csv(path, date_format="%Y-%m-%d") + datetime_frame.to_csv( + path, date_format="%Y-%m-%d", fast_strftime=fast_strftime + ) # Check that the data was put in the specified format test = read_csv(path, index_col=0) @@ -1028,7 +1033,9 @@ def 
test_to_csv_date_format(self, datetime_frame): # Check that columns get converted datetime_frame_columns = datetime_frame.T - datetime_frame_columns.to_csv(path, date_format="%Y%m%d") + datetime_frame_columns.to_csv( + path, date_format="%Y%m%d", fast_strftime=fast_strftime + ) test = read_csv(path, index_col=0) @@ -1047,14 +1054,15 @@ def test_to_csv_date_format(self, datetime_frame): ["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"] ) nat_frame = DataFrame({"A": nat_index}, index=nat_index) - nat_frame.to_csv(path, date_format="%Y-%m-%d") + nat_frame.to_csv(path, date_format="%Y-%m-%d", fast_strftime=fast_strftime) test = read_csv(path, parse_dates=[0, 1], index_col=0) tm.assert_frame_equal(test, nat_frame) + @pytest.mark.parametrize("fast_strftime", (True, False)) @pytest.mark.parametrize("td", [pd.Timedelta(0), pd.Timedelta("10s")]) - def test_to_csv_with_dst_transitions(self, td): + def test_to_csv_with_dst_transitions(self, td, fast_strftime): with tm.ensure_clean("csv_date_format_with_dst") as path: # make sure we are not failing on transitions @@ -1069,7 +1077,7 @@ def test_to_csv_with_dst_transitions(self, td): i = i._with_freq(None) # freq is not preserved by read_csv time_range = np.array(range(len(i)), dtype="int64") df = DataFrame({"A": time_range}, index=i) - df.to_csv(path, index=True) + df.to_csv(path, index=True, fast_strftime=fast_strftime) # we have to reconvert the index as we # don't parse the tz's result = read_csv(path, index_col=0) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 9ab3e4cf6afac..f70698deb4883 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1,7 +1,10 @@ """ Test output formatting for Series/DataFrame, including to_string & reprs """ -from datetime import datetime +from datetime import ( + datetime, + time, +) from io import StringIO import itertools from operator import methodcaller @@ -11,12 +14,14 @@ from shutil import 
get_terminal_size import sys import textwrap +from timeit import Timer import dateutil import numpy as np import pytest import pytz +from pandas._libs.tslibs import convert_strftime_format from pandas.compat import ( IS64, is_platform_windows, @@ -39,6 +44,7 @@ set_option, ) import pandas._testing as tm +from pandas.core.tools.datetimes import fast_strftime import pandas.io.formats.format as fmt import pandas.io.formats.printing as printing @@ -46,6 +52,13 @@ use_32bit_repr = is_platform_windows() or not IS64 +def get_local_am_pm(): + """Return the AM and PM strings returned by strftime in current locale""" + am_local = time(1).strftime("%p") + pm_local = time(13).strftime("%p") + return am_local, pm_local + + @pytest.fixture(params=["string", "pathlike", "buffer"]) def filepath_or_buffer_id(request): """ @@ -3206,6 +3219,72 @@ def test_period_custom(self): assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456789)" assert formatted[1] == "03 12:01:01 (ms=123 us=123456 ns=123456790)" + def test_period_locale(self): + """Test that `convert_strftime_format` relies on runtime locale + + If this test fails, all tests using %p format strftime will fail when + the runtime locale is different from the compile-time one. + """ + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # Use the function + str_tmp, loc_s = convert_strftime_format("%p") + assert str_tmp == "%(ampm)s" + + # Now what about the classes ? 
+ # Timestamp + am_ts = Timestamp(2020, 1, 1, 1) + assert am_local == am_ts.strftime("%p") + assert am_local == am_ts.fast_strftime(str_tmp, loc_s) + pm_ts = Timestamp(2020, 1, 1, 13) + assert pm_local == pm_ts.strftime("%p") + assert pm_local == pm_ts.fast_strftime(str_tmp, loc_s) + + # Period + am_per = pd.Period("2018-03-11 01:00", freq="H") + assert am_local == am_per.strftime("%p") + assert am_local == am_per.fast_strftime(str_tmp, loc_s) + pm_per = pd.Period("2018-03-11 13:00", freq="H") + assert pm_local == pm_per.strftime("%p") + assert pm_local == pm_per.fast_strftime(str_tmp, loc_s) + + @pytest.mark.parametrize("fast_strftime", (False, True)) + def test_period_custom(self, fast_strftime): + # GH46252 + # fmt: off + + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # 3 digits + p = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="l") + formatted = p.format( + date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime + ) + assert formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123000 ns=123000000)" + assert formatted[1] == f"03 12:01:01{pm_local} (ms=124 us=124000 ns=124000000)" + + # 6 digits + p = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="u") + formatted = p.format( + date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime + ) + assert formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456000)" + assert formatted[1] == f"03 12:01:01{pm_local} (ms=123 us=123457 ns=123457000)" + + # 9 digits + p = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") + formatted = p.format( + date_format="%y %I:%M:%S%p (ms=%l us=%u ns=%n)", + fast_strftime=fast_strftime + ) + assert formatted[0] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456789)" + assert formatted[1] == f"03 12:01:01{pm_local} (ms=123 us=123456 ns=123456790)" + # fmt: on + def test_period_tz(self): # Formatting periods created from a datetime with timezone. 
@@ -3231,6 +3310,45 @@ def test_datetime(self): assert formatted[0] == "2003-01-01 12:00:00" assert formatted[1] == "NaT" + def test_datetime_tz(self): + # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC + dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) + # If tz is currently set as utc, we'll see 2012 + assert dt.format()[0] == "2012-12-31 23:00:00+00:00" + # If tz is currently set as paris, we'll see 2013 + dt = dt.tz_convert("Europe/Paris") + assert dt.format()[0] == "2013-01-01 00:00:00+01:00" + + def test_datetime_tz_custom(self): + # Get locale-specific reference + am_local, pm_local = get_local_am_pm() + + # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC + dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) + + # If tz is currently set as utc, we'll see 2012 + assert ( + dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] + == "2012-12-31__foo__23:00:00" + ) + # same with fancy format + assert ( + dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] + == f"2012-12-31__foo__11:00:00{pm_local}" + ) + + # If tz is currently set as paris, we'll see 2013 + dt = dt.tz_convert("Europe/Paris") + assert ( + dt.format(date_format="%Y-%m-%d__foo__%H:%M:%S")[0] + == "2013-01-01__foo__00:00:00" + ) + # same with fancy format + assert ( + dt.format(date_format="20%y-%m-%d__foo__%I:%M:%S%p")[0] + == f"2013-01-01__foo__12:00:00{am_local}" + ) + def test_date(self): formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() assert formatted[0] == "2003-01-01" @@ -3311,6 +3429,118 @@ def test_nat_representations(self): assert f(NaT) == "NaT" +class TestDatetimeFastFormatter: + """ + Test that the new `fast_strftime` mode improves formatting perf for all + kind of date/time objects. 
+ """ + + @pytest.mark.parametrize( + "strftime_format", + ( + "%Y-%m-%d %H:%M:%S", + "%Y%%Y", + "%Y{%Y}", + "%Y-%m-%dT%H:%M:%S.%fZ", + ), + ) + @pytest.mark.parametrize("new_style", (False, True)) + def test_fast_strftime_basic(self, strftime_format, new_style): + """ + Test that formatting standard `datetime` objects with our utils works + as good as strftime. + """ + + # create a datetime instance + dt = datetime.now() + + # strftime + strftime_res = dt.strftime(strftime_format) + + # get the formatting string + # fmt: off + style_format, loc_s = convert_strftime_format( + strftime_format, new_style_fmt=new_style + ) + # fmt: on + + # common dict used for formatting + fmt_dct = { + "year": dt.year, + "shortyear": dt.year % 100, + "month": dt.month, + "day": dt.day, + "hour": dt.hour, + "hour12": 12 if dt.hour in (0, 12) else (dt.hour % 12), + "ampm": loc_s.pm if (dt.hour // 12) else loc_s.am, + "min": dt.minute, + "sec": dt.second, + "us": dt.microsecond, + } + + # apply it and check the output + if new_style: + res = style_format.format(**fmt_dct) + else: + res = style_format % fmt_dct + assert res == strftime_res + + # fast_strftime is a shortcut function for the above + # but it would actually only be fast if it could receive lists :) + res2 = fast_strftime(dt, strftime_format, new_style_fmt=new_style) + assert res2 == res + + def test_bad_strftime_directive(self): + """Test what happens in case of bad `date_format` directive.""" + + x = Series(date_range("20130101 09:00:00", periods=5, freq="us")) + + # This does not raise any error, while %D is not a correct directive ! 
+ x.dt.strftime(date_format="%Y-%M-%D___", fast_strftime=False) + + # We align with the same behaviour + x.dt.strftime(date_format="%Y-%M-%D___") + + @pytest.mark.parametrize( + "date_format", + ( + # note: "%Y-%m-%d %H:%M:%S and "%Y-%m-%d %H:%M:%S.%f + # are always accelerated (hardcoded) + "%Y-%m-%d__foo__%H:%M:%S", + "%Y %Y", + "%Y-%m-%dT%H:%M:%S.%fZ", + ), + ) + def test_perf_datetime64_strftime(self, date_format): + x = Series(date_range("20130101 09:00:00", periods=100, freq="min")) + # res = x.dt.strftime(date_format=date_format) + # slow_res = x.dt.strftime(date_format=date_format, fast_strftime=False) + + glob = dict(globals()) + glob.update(locals()) + fast_best = min( + Timer("x.dt.strftime(date_format=date_format)", globals=glob).repeat(3, 100) + ) + strftime_best = min( + Timer( + "x.dt.strftime(date_format=date_format, fast_strftime=False)", + globals=glob, + ).repeat(3, 100) + ) + assert fast_best < strftime_best # much better + + # How many alternative are worth doing here ? + # probably datetime, date, period ? 
+ + # fast_fmt = fmt.Datetime64Formatter(x) + # assert fast_fmt.fast_strftime is True + # fast_result = fast_fmt.get_result() + # slow_result = fmt.Datetime64Formatter(x, fast_strftime=False).get_result() + # for i in range(100): + # assert fast_result[i].strip() == "2013-01-01 00:00:00" + # assert slow_result[i].strip() == "2013-01-01 00:00:00" + + def test_format_percentiles(): result = fmt.format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) expected = ["1.999%", "2.001%", "50%", "66.667%", "99.99%"] diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index d3f8e27c47e98..d600074dbe743 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -284,6 +284,69 @@ def test_to_csv_date_format(self): df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + def test_to_csv_datetime_format_index(self): + """Test that formatting also works for datetime index""" + df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) + df_sec = df_sec.set_index("A") + + # default date_format + res = df_sec.to_csv() + expected_rows = [ + "A", + "2013-01-01 00:00:00", + "2013-01-01 00:00:01", + "2013-01-01 00:00:02", + "2013-01-01 00:00:03", + "2013-01-01 00:00:04", + ] + expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_default_sec + + # custom date_format + res = df_sec.to_csv(date_format="%Y-%m-%d %H:%M:%S.%f") + expected_rows = [ + "A", + "2013-01-01 00:00:00.000000", + "2013-01-01 00:00:01.000000", + "2013-01-01 00:00:02.000000", + "2013-01-01 00:00:03.000000", + "2013-01-01 00:00:04.000000", + ] + expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_default_sec + + def test_to_csv_period_format_index(self): + """Test that formatting also works for period index""" + # same for periods + df_month 
= DataFrame({"A": pd.period_range("20130101", periods=5, freq="m")}) + df_month = df_month.set_index("A") + + # default date_format + res = df_month.to_csv() + expected_rows = [ + "A", + "2013-01", + "2013-02", + "2013-03", + "2013-04", + "2013-05", + ] + expected_default_mon = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_default_mon + + # custom format + res = df_month.to_csv(date_format="%F : %q") + expected_rows = [ + "A", + "2013 : 1", + "2013 : 1", + "2013 : 1", + "2013 : 2", + "2013 : 2", + ] + expected_ymdhms_month = tm.convert_rows_list_to_csv_str(expected_rows) + assert res == expected_ymdhms_month + def test_to_csv_different_datetime_formats(self): # GH#21734 df = DataFrame( diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 873103b01f64d..2461739cdd876 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -190,7 +190,17 @@ def test_nat_iso_format(get_nat): @pytest.mark.parametrize( "klass,expected", [ - (Timestamp, ["freqstr", "normalize", "to_julian_date", "to_period", "tz"]), + ( + Timestamp, + [ + "fast_strftime", + "freqstr", + "normalize", + "to_julian_date", + "to_period", + "tz", + ], + ), ( Timedelta, [ diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 755ac3d144246..10446a2e88288 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -17,6 +17,7 @@ def test_namespace(): "parsing", "period", "strptime", + "strftime", "vectorized", "timedeltas", "timestamps", @@ -33,6 +34,8 @@ def test_namespace(): "OutOfBoundsDatetime", "OutOfBoundsTimedelta", "Period", + "convert_strftime_format", + "UnsupportedStrFmtDirective", "IncompatibleFrequency", "Resolution", "Tick",