diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index 3cfa28de78c90..fb8b7dafa0ade 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -39,6 +39,8 @@ def setup(self, connection): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -53,7 +55,16 @@ class WriteSQLDtypes: params = ( ["sqlalchemy", "sqlite"], - ["float", "float_with_nan", "string", "bool", "int", "datetime"], + [ + "float", + "float_with_nan", + "string", + "bool", + "int", + "date", + "time", + "datetime", + ], ) param_names = ["connection", "dtype"] @@ -78,6 +89,8 @@ def setup(self, connection, dtype): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -105,6 +118,8 @@ def setup(self): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -122,7 +137,16 @@ def time_read_sql_table_parse_dates(self): class ReadSQLTableDtypes: - params = ["float", "float_with_nan", "string", "bool", "int", "datetime"] + params = [ + "float", + "float_with_nan", + "string", + "bool", + "int", + "date", + "time", + "datetime", + ] param_names = ["dtype"] def setup(self, dtype): @@ -141,6 +165,8 @@ def setup(self, dtype): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["date"] = 
self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") diff --git a/asv_bench/benchmarks/tslibs/strftime.py b/asv_bench/benchmarks/tslibs/strftime.py new file mode 100644 index 0000000000000..ac1b7f65d2d90 --- /dev/null +++ b/asv_bench/benchmarks/tslibs/strftime.py @@ -0,0 +1,64 @@ +import numpy as np + +import pandas as pd +from pandas import offsets + + +class DatetimeStrftime: + timeout = 1500 + params = [1000, 10000] + param_names = ["obs"] + + def setup(self, obs): + d = "2018-11-29" + dt = "2018-11-26 11:18:27.0" + self.data = pd.DataFrame( + { + "dt": [np.datetime64(dt)] * obs, + "d": [np.datetime64(d)] * obs, + "r": [np.random.uniform()] * obs, + } + ) + + def time_frame_date_to_str(self, obs): + self.data["d"].astype(str) + + def time_frame_date_formatting_default(self, obs): + self.data["d"].dt.strftime(date_format="%Y-%m-%d") + + def time_frame_date_formatting_custom(self, obs): + self.data["d"].dt.strftime(date_format="%Y---%m---%d") + + def time_frame_datetime_to_str(self, obs): + self.data["dt"].astype(str) + + def time_frame_datetime_formatting_default_date_only(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d") + + def time_frame_datetime_formatting_default(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S") + + def time_frame_datetime_formatting_default_with_float(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") + + def time_frame_datetime_formatting_custom(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") + + +class BusinessHourStrftime: + timeout = 1500 + params = [1000, 10000] + param_names = ["obs"] + + def setup(self, obs): + self.data = pd.DataFrame( + { + "off": [offsets.BusinessHour()] * obs, + } + ) + + def time_frame_offset_str(self, obs): + self.data["off"].apply(str) + + def 
time_frame_offset_repr(self, obs): + self.data["off"].apply(repr) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index d237ee9921356..34f6159ec093f 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -792,6 +792,10 @@ Performance improvements - Performance improvement in :func:`read_excel` when ``nrows`` argument provided (:issue:`32727`) - Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`) - Performance improvement in :meth:`MultiIndex.is_monotonic_increasing` (:issue:`47458`) +- Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`) +- Performance improvement in string formatting of datetime arrays when one of the default strftime formats ``"%Y-%m-%d"``, ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used (:issue:`44764`) +- Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays (:issue:`44764`) +- .. --------------------------------------------------------------------------- .. 
_whatsnew_150.bug_fixes: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index dc7504b1073f5..ee3964b892e2e 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -131,7 +131,7 @@ def format_array_from_datetime( cdef: int64_t val, ns, N = values.size bint show_ms = False, show_us = False, show_ns = False - bint basic_format = False + bint basic_format = False, basic_format_day = False _Timestamp ts object res npy_datetimestruct dts @@ -147,14 +147,31 @@ def format_array_from_datetime( if na_rep is None: na_rep = 'NaT' - # if we don't have a format nor tz, then choose - # a format based on precision - basic_format = format is None and tz is None - if basic_format: - reso_obj = get_resolution(values, reso=reso) - show_ns = reso_obj == Resolution.RESO_NS - show_us = reso_obj == Resolution.RESO_US - show_ms = reso_obj == Resolution.RESO_MS + if tz is None: + # if we don't have a format nor tz, then choose + # a format based on precision + basic_format = format is None + if basic_format: + reso_obj = get_resolution(values, reso=reso) + show_ns = reso_obj == Resolution.RESO_NS + show_us = reso_obj == Resolution.RESO_US + show_ms = reso_obj == Resolution.RESO_MS + + elif format == "%Y-%m-%d %H:%M:%S": + # Same format as default, but with hardcoded precision (s) + basic_format = True + show_ns = show_us = show_ms = False + + elif format == "%Y-%m-%d %H:%M:%S.%f": + # Same format as default, but with hardcoded precision (us) + basic_format = show_us = True + show_ns = show_ms = False + + elif format == "%Y-%m-%d": + # Default format for dates + basic_format_day = True + + assert not (basic_format_day and basic_format) for i in range(N): # Analogous to: utc_val = values[i] @@ -162,6 +179,11 @@ def format_array_from_datetime( if val == NPY_NAT: res = na_rep + elif basic_format_day: + + pandas_datetime_to_datetimestruct(val, reso, &dts) + res = f'{dts.year}-{dts.month:02d}-{dts.day:02d}' + elif basic_format: pandas_datetime_to_datetimestruct(val, 
reso, &dts) @@ -176,11 +198,11 @@ def format_array_from_datetime( elif show_ms: res += f'.{dts.us // 1000:03d}' - else: ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz) if format is None: + # Use datetime.__str__, which returns ts.isoformat(sep=' ') res = str(ts) else: @@ -190,6 +212,7 @@ def format_array_from_datetime( # Note: dispatches to pydatetime res = ts.strftime(format) except ValueError: + # Use datetime.__str__, which returns ts.isoformat(sep=' ') res = str(ts) # Note: we can index result directly instead of using PyArray_MultiIter_DATA diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index f29e6a2d0d9ea..81b59db6f0e18 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1566,8 +1566,9 @@ cdef class BusinessHour(BusinessMixin): def _repr_attrs(self) -> str: out = super()._repr_attrs() + # Use python string formatting to be faster than strftime hours = ",".join( - f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}' + f'{st.hour:02d}:{st.minute:02d}-{en.hour:02d}:{en.minute:02d}' for st, en in zip(self.start, self.end) ) attrs = [f"{self._prefix}={hours}"] diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index aedecc33ceee9..ae3ce46cbc3c8 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1014,7 +1014,7 @@ cdef class _Timestamp(ABCTimestamp): @property def _date_repr(self) -> str: # Ideal here would be self.strftime("%Y-%m-%d"), but - # the datetime strftime() methods require year >= 1900 + # the datetime strftime() methods require year >= 1900 and are slower return f'{self.year}-{self.month:02d}-{self.day:02d}' @property diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 7e6ea07c154cd..4d97345912250 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -646,11 +646,14 @@ def _format_native_types( """ values = self.astype(object) + # Create the formatter function if 
date_format: formatter = lambda per: per.strftime(date_format) else: + # Uses `_Period.str` which in turn uses `format_period` formatter = lambda per: str(per) + # Apply the formatter to all values in the array, possibly with a mask if self._hasna: mask = self._isnan values[mask] = na_rep diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 74bd345fe4e18..0a43c93cb6c9b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -430,6 +430,7 @@ def _format_native_types( ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_timedelta64 + # Relies on Timedelta._repr_base formatter = get_format_timedelta64(self._ndarray, na_rep) # equiv: np.array([formatter(x) for x in self._ndarray]) # but independent of dimension diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index f5367df6f228d..a6e125f4b9f33 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -189,14 +189,18 @@ def _make_table_cell(self, cell) -> tuple[object, Any]: value = str(val).lower() pvalue = str(val).upper() if isinstance(val, datetime.datetime): + # Fast formatting value = val.isoformat() + # Slow but locale-dependent pvalue = val.strftime("%c") return ( pvalue, TableCell(valuetype="date", datevalue=value, attributes=attributes), ) elif isinstance(val, datetime.date): - value = val.strftime("%Y-%m-%d") + # Fast formatting + value = f"{val.year}-{val.month:02d}-{val.day:02d}" + # Slow but locale-dependent pvalue = val.strftime("%x") return ( pvalue, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 08deafce3af98..497523211f2d8 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1768,6 +1768,8 @@ def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str: if x is NaT: return nat_rep + # Timestamp.__str__ falls back to datetime.datetime.__str__ = isoformat(sep=' ') + # so it already uses string 
formatting rather than strftime (faster). return str(x) @@ -1782,12 +1784,15 @@ def _format_datetime64_dateonly( if date_format: return x.strftime(date_format) else: + # Timestamp._date_repr relies on string formatting (faster than strftime) return x._date_repr def get_format_datetime64( is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None ) -> Callable: + """Return a formatter callable taking a datetime64 as input and providing + a string as output""" if is_dates_only: return lambda x: _format_datetime64_dateonly( @@ -1808,6 +1813,7 @@ def get_format_datetime64_from_values( ido = is_dates_only(values) if ido: + # Only dates and no timezone: provide a default format return date_format or "%Y-%m-%d" return date_format @@ -1881,6 +1887,8 @@ def _formatter(x): if not isinstance(x, Timedelta): x = Timedelta(x) + + # Timedelta._repr_base uses string formatting (faster than strftime) result = x._repr_base(format=format) if box: result = f"'{result}'" diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 24290b8370ed2..0121ed7241a57 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1848,7 +1848,11 @@ def __init__(self, *args, **kwargs) -> None: # this will transform time(12,34,56,789) into '12:34:56.000789' # (this is what sqlalchemy does) - sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f")) + def _adapt_time(t): + # This is faster than strftime + return f"{t.hour:02d}:{t.minute:02d}:{t.second:02d}.{t.microsecond:06d}" + + sqlite3.register_adapter(time, _adapt_time) super().__init__(*args, **kwargs) def sql_schema(self) -> str: diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index e28901fa1a1ed..5e3f8974cd576 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2595,9 +2595,17 @@ def test_datetime_date(self): elif self.flavor == "mysql": tm.assert_frame_equal(res, df) - def test_datetime_time(self): + @pytest.mark.parametrize("tz_aware", [False, True]) + def 
test_datetime_time(self, tz_aware): # test support for datetime.time, GH #8341 - df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) + if not tz_aware: + tz_times = [time(9, 0, 0), time(9, 1, 30)] + else: + tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific") + tz_times = Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz()) + + df = DataFrame(tz_times, columns=["a"]) + assert df.to_sql("test_time", self.conn, index=False) == 2 res = read_sql_query("SELECT * FROM test_time", self.conn) if self.flavor == "sqlite":