From 01b966716724587f2fc6070fea63aa68af0f57d0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 12 Apr 2022 23:14:17 +0200 Subject: [PATCH 01/34] Performance improvement in :class:`BusinessHour`, ``repr`` is now 4 times faster ! Related to #44764 --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index dbf9547f561d2..f205033463df9 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -385,6 +385,7 @@ Performance improvements - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`) - Performance improvement in :func:`factorize` (:issue:`46109`) - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) +- Performance improvement in :class:`BusinessHour`, ``repr`` is now 4 times faster ! (related to :issue:`44764`) .. --------------------------------------------------------------------------- .. _whatsnew_150.bug_fixes: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index fef98199d3dbc..089d8cb1e2396 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1566,8 +1566,10 @@ cdef class BusinessHour(BusinessMixin): def _repr_attrs(self) -> str: out = super()._repr_attrs() + # Use python string formatting to be faster than strftime + # f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}' hours = ",".join( - f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}' + f'{st.hour:02d}:{st.minute:02d}-{en.hour:02d}:{en.minute:02d}' for st, en in zip(self.start, self.end) ) attrs = [f"{self._prefix}={hours}"] From 139f229a93898172974827eddb7280a31183b99c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 12 Apr 2022 23:37:42 +0200 Subject: [PATCH 02/34] For the record: switching the repr in TimeDelta to old-style formatting --- pandas/_libs/tslibs/timedeltas.pyx | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 627006a7f32c0..87f4ce8785c5d 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1171,32 +1171,32 @@ cdef class _Timedelta(timedelta): sign = " " if format == 'all': - fmt = ("{days} days{sign}{hours:02}:{minutes:02}:{seconds:02}." - "{milliseconds:03}{microseconds:03}{nanoseconds:03}") + fmt = ("%(days)s days%(sign)s%(hours)02d:%(minutes)02d:%(seconds)02d." + "%(milliseconds)03d%(microseconds)03d%(nanoseconds)03d") else: # if we have a partial day subs = (self._h or self._m or self._s or self._ms or self._us or self._ns) if self._ms or self._us or self._ns: - seconds_fmt = "{seconds:02}.{milliseconds:03}{microseconds:03}" + seconds_fmt = "%(seconds)02d.%(milliseconds)03d%(microseconds)03d" if self._ns: # GH#9309 - seconds_fmt += "{nanoseconds:03}" + seconds_fmt += "%(nanoseconds)03d" else: - seconds_fmt = "{seconds:02}" + seconds_fmt = "%(seconds)02d" if format == 'sub_day' and not self._d: - fmt = "{hours:02}:{minutes:02}:" + seconds_fmt + fmt = "%(hours)02d:%(minutes)02d:" + seconds_fmt elif subs or format == 'long': - fmt = "{days} days{sign}{hours:02}:{minutes:02}:" + seconds_fmt + fmt = "%(days)s days%(sign)s%(hours)02d:%(minutes)02d:" + seconds_fmt else: - fmt = "{days} days" + fmt = "%(days)s days" comp_dict = self.components._asdict() comp_dict['sign'] = sign - return fmt.format(**comp_dict) + return fmt % comp_dict def __repr__(self) -> str: repr_based = self._repr_base(format='long') From 75e8fd6a9595fc62b49c57168bf758cbb4da1156 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 12 Apr 2022 23:39:36 +0200 Subject: [PATCH 03/34] added comment --- pandas/_libs/tslibs/timedeltas.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 87f4ce8785c5d..94852d95ea073 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1196,6 +1196,7 @@ cdef class _Timedelta(timedelta): comp_dict = self.components._asdict() comp_dict['sign'] = sign + # Note: using old-style formatting is very slightly faster than new-style return fmt % comp_dict def __repr__(self) -> str: From 0beed46f81aa1b3a0dc809c3ec5f51cc649f6d71 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 12 Apr 2022 23:40:11 +0200 Subject: [PATCH 04/34] Minor: comment on speed --- pandas/_libs/tslibs/timestamps.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 2afceb827e49a..fd0e86741d8a7 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -819,7 +819,7 @@ cdef class _Timestamp(ABCTimestamp): @property def _date_repr(self) -> str: # Ideal here would be self.strftime("%Y-%m-%d"), but - # the datetime strftime() methods require year >= 1900 + # the datetime strftime() methods require year >= 1900 and is slower return f'{self.year}-{self.month:02d}-{self.day:02d}' @property From cdffff890d6c652e208722953804f6f423a901ab Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 12 Apr 2022 23:41:36 +0200 Subject: [PATCH 05/34] added two comments for maintenance, about speed --- pandas/_libs/tslib.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 9dfc438319148..0ed887bcbd1f6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -170,6 +170,7 @@ def format_array_from_datetime( ts = Timestamp(val, tz=tz) if format is None: + # Use datetime.str, that returns ts.isoformat(sep=' ') result[i] = str(ts) else: @@ -178,6 +179,7 @@ def format_array_from_datetime( try: result[i] = ts.strftime(format) except ValueError: + # Use datetime.str, that returns ts.isoformat(sep=' ') result[i] = str(ts) return result From 903d08cb773759c5d4b0341df8e0bf654fdabec0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 13 Apr 2022 00:00:39 +0200 Subject: [PATCH 06/34] Performance improvement in :class:`DatetimeArray` and :class:`DatetimeIndex`: string formatting is now up to 80% faster (as fast as default) when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. See #44764 --- doc/source/whatsnew/v1.5.0.rst | 3 ++- pandas/_libs/tslib.pyx | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index f205033463df9..a0f3288cb82d1 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -385,7 +385,8 @@ Performance improvements - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`) - Performance improvement in :func:`factorize` (:issue:`46109`) - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) -- Performance improvement in :class:`BusinessHour`, ``repr`` is now 4 times faster ! (related to :issue:`44764`) +- Performance improvement in :class:`BusinessHour`, ``str`` and ``repr`` are now 4 times faster ! (related to :issue:`44764`) +- Performance improvement in :class:`DatetimeArray` and :class:`DatetimeIndex`: string formatting is now up to 80% faster (as fast as default) when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) .. --------------------------------------------------------------------------- .. _whatsnew_150.bug_fixes: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 0ed887bcbd1f6..306c116726c29 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -145,6 +145,16 @@ def format_array_from_datetime( consider_values //= 1000 show_ms = (consider_values % 1000).any() + elif format == "%Y-%m-%d %H:%M:%S": + # Same format as default, but with hardcoded precision (s) + basic_format = True + show_ns = show_us = show_ms = False + + elif format == "%Y-%m-%d %H:%M:%S.%f": + # Same format as default, but with hardcoded precision (us) + basic_format = show_us = True + show_ns = show_ms = False + for i in range(N): val = values[i] From f6e4e5a3a6e64569dd93e8ea8f9eefae6da556f0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 13 Apr 2022 00:04:32 +0200 Subject: [PATCH 07/34] A few comments related to time formatting --- pandas/core/arrays/period.py | 3 +++ pandas/core/arrays/timedeltas.py | 1 + 2 files changed, 4 insertions(+) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index fa543f6773634..123e2928c5cea 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -641,11 +641,14 @@ def _format_native_types( """ values = self.astype(object) + # Create the formatter function if date_format: formatter = lambda per: per.strftime(date_format) else: + # Uses `_Period.str>format_period` formatter = lambda per: str(per) + # Apply the formatter to all values in the array, possibly with a mask if self._hasna: mask = self._isnan values[mask] = na_rep diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index dc63cd92bbb2b..c5dcc5348ea17 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -434,6 +434,7 @@ def _format_native_types( ) -> np.ndarray: from pandas.io.formats.format import get_format_timedelta64 + # Relies on TimeDelta._repr_base formatter = get_format_timedelta64(self._ndarray, na_rep) return np.array([formatter(x) for x in self._ndarray]) From 9851b36982d5e0fca2754f999af3040bb7f66a5f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 13 Apr 2022 00:31:28 +0200 Subject: [PATCH 08/34] Performance improvement in :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` (:class:`ExcelFormatter`): processing dates can be up to 4% faster. (related to :issue:`44764`) --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/io/excel/_odswriter.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a0f3288cb82d1..7eff00da812dc 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -387,6 +387,7 @@ Performance improvements - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) - Performance improvement in :class:`BusinessHour`, ``str`` and ``repr`` are now 4 times faster ! (related to :issue:`44764`) - Performance improvement in :class:`DatetimeArray` and :class:`DatetimeIndex`: string formatting is now up to 80% faster (as fast as default) when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) +- Performance improvement in :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` (:class:`ExcelFormatter`): processing dates can be up to 4% faster. (related to :issue:`44764`) .. --------------------------------------------------------------------------- .. _whatsnew_150.bug_fixes: diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index f5367df6f228d..f56e0ec7dec28 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -189,15 +189,15 @@ def _make_table_cell(self, cell) -> tuple[object, Any]: value = str(val).lower() pvalue = str(val).upper() if isinstance(val, datetime.datetime): - value = val.isoformat() - pvalue = val.strftime("%c") + value = val.isoformat() # fast formatting + pvalue = val.strftime("%c") # slow but locale-dependent return ( pvalue, TableCell(valuetype="date", datevalue=value, attributes=attributes), ) elif isinstance(val, datetime.date): - value = val.strftime("%Y-%m-%d") - pvalue = val.strftime("%x") + value = f"{val.year}-{val.month:02d}-{val.day:02d}" # fast formatting + pvalue = val.strftime("%x") # slow but locale-dependent return ( pvalue, TableCell(valuetype="date", datevalue=value, attributes=attributes), From d302fe1ed3043db1d8cf1a58cf9a7806fd43b03f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 13 Apr 2022 00:42:28 +0200 Subject: [PATCH 09/34] Added maintenance comments --- pandas/io/formats/format.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index ea938c924ae0c..94d9b001f7603 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1759,6 +1759,8 @@ def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str: if x is NaT: return nat_rep + # Timestamp.__str__ falls back to datetime.datetime.__str__ = isoformat(sep=' ') + # so it already uses string formatting rather than strftime (faster). return str(x) @@ -1773,12 +1775,15 @@ def _format_datetime64_dateonly( if date_format: return x.strftime(date_format) else: + # Timestamp._date_repr relies on string formatting (faster than strftime) return x._date_repr def get_format_datetime64( is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None ) -> Callable: + """Return a formatter callable taking a datetime64 as input and providing + a string as output""" if is_dates_only: return lambda x: _format_datetime64_dateonly( @@ -1799,6 +1804,7 @@ def get_format_datetime64_from_values( ido = is_dates_only(values) if ido: + # Only dates and no timezone: provide a default format return date_format or "%Y-%m-%d" return date_format @@ -1872,6 +1878,8 @@ def _formatter(x): if not isinstance(x, Timedelta): x = Timedelta(x) + + # Timedelta._repr_base uses string formatting (faster than strftime) result = x._repr_base(format=format) if box: result = f"'{result}'" From 54f51f004425765ebd0c690b82e5ef6a734f3e44 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 13 Apr 2022 01:08:58 +0200 Subject: [PATCH 10/34] Performance improvement in `Series.to_sql` and `DataFrame.to_sql` (`SQLiteTable`): processing time arrays can be up to 65% faster ! (related to #44764) --- doc/source/whatsnew/v1.5.0.rst | 4 +++- pandas/io/sql.py | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 7eff00da812dc..753e39086e5db 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -387,7 +387,9 @@ Performance improvements - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) - Performance improvement in :class:`BusinessHour`, ``str`` and ``repr`` are now 4 times faster ! (related to :issue:`44764`) - Performance improvement in :class:`DatetimeArray` and :class:`DatetimeIndex`: string formatting is now up to 80% faster (as fast as default) when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) -- Performance improvement in :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` (:class:`ExcelFormatter`): processing dates can be up to 4% faster. (related to :issue:`44764`) +- Performance improvement in :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` (:class:`ExcelFormatter`): processing date cells can be up to 4% faster. (related to +:issue:`44764`) +- Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`): processing time arrays can be up to 65% faster ! (related to :issue:`44764`) .. --------------------------------------------------------------------------- .. _whatsnew_150.bug_fixes: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 701642ad2cfe2..5adb0048ef692 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1836,7 +1836,20 @@ def __init__(self, *args, **kwargs) -> None: # this will transform time(12,34,56,789) into '12:34:56.000789' # (this is what sqlalchemy does) - sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f")) + def _adapt(dt): + if dt.tzinfo is None: + # This is faster than strftime + return "%02d:%02d:%02d.%06d" % ( + dt.hour, + dt.minute, + dt.second, + dt.microsecond, + ) + else: + # Slow TODO we can probably find a way to use string formatting too + return dt.strftime("%H:%M:%S.%f") + + sqlite3.register_adapter(time, _adapt) super().__init__(*args, **kwargs) def sql_schema(self): From 6d60a04e9427cf67980a2d2761055a425fa08e7d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 13 Apr 2022 09:52:06 +0200 Subject: [PATCH 11/34] Revert "For the record: switching the repr in TimeDelta to old-style formatting" This reverts commit 139f229a --- pandas/_libs/tslibs/timedeltas.pyx | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index f9e8b799879c0..7979feb076c6e 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1173,33 +1173,32 @@ cdef class _Timedelta(timedelta): sign = " " if format == 'all': - fmt = ("%(days)s days%(sign)s%(hours)02d:%(minutes)02d:%(seconds)02d." - "%(milliseconds)03d%(microseconds)03d%(nanoseconds)03d") + fmt = ("{days} days{sign}{hours:02}:{minutes:02}:{seconds:02}." + "{milliseconds:03}{microseconds:03}{nanoseconds:03}") else: # if we have a partial day subs = (self._h or self._m or self._s or self._ms or self._us or self._ns) if self._ms or self._us or self._ns: - seconds_fmt = "%(seconds)02d.%(milliseconds)03d%(microseconds)03d" + seconds_fmt = "{seconds:02}.{milliseconds:03}{microseconds:03}" if self._ns: # GH#9309 - seconds_fmt += "%(nanoseconds)03d" + seconds_fmt += "{nanoseconds:03}" else: - seconds_fmt = "%(seconds)02d" + seconds_fmt = "{seconds:02}" if format == 'sub_day' and not self._d: - fmt = "%(hours)02d:%(minutes)02d:" + seconds_fmt + fmt = "{hours:02}:{minutes:02}:" + seconds_fmt elif subs or format == 'long': - fmt = "%(days)s days%(sign)s%(hours)02d:%(minutes)02d:" + seconds_fmt + fmt = "{days} days{sign}{hours:02}:{minutes:02}:" + seconds_fmt else: - fmt = "%(days)s days" + fmt = "{days} days" comp_dict = self.components._asdict() comp_dict['sign'] = sign - # Note: using old-style formatting is very slightly faster than new-style - return fmt % comp_dict + return fmt.format(**comp_dict) def __repr__(self) -> str: repr_based = self._repr_base(format='long') From 705471fa6d34c51dcb3b680ed493deac637f8fb9 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 13 Apr 2022 09:55:17 +0200 Subject: [PATCH 12/34] Updated what's new wrt code review --- doc/source/whatsnew/v1.5.0.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index c0019cb294806..de28eba195fce 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -452,11 +452,9 @@ Performance improvements - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`) - Performance improvement in :func:`factorize` (:issue:`46109`) - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) -- Performance improvement in :class:`BusinessHour`, ``str`` and ``repr`` are now 4 times faster ! (related to :issue:`44764`) -- Performance improvement in :class:`DatetimeArray` and :class:`DatetimeIndex`: string formatting is now up to 80% faster (as fast as default) when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) -- Performance improvement in :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` (:class:`ExcelFormatter`): processing date cells can be up to 4% faster. (related to -:issue:`44764`) -- Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`): processing time arrays can be up to 65% faster ! (related to :issue:`44764`) +- Performance improvement in :class:`BusinessHour` ``str`` and ``repr``. (related to :issue:`44764`) +- Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) +- Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (related to :issue:`44764`) .. --------------------------------------------------------------------------- .. _whatsnew_150.bug_fixes: From a5203ff3fd8222c8991b584c16e60920260085f3 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 13 Apr 2022 10:50:23 +0200 Subject: [PATCH 13/34] Fixed what's new error --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index de28eba195fce..dca862e888eb2 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -455,6 +455,7 @@ Performance improvements - Performance improvement in :class:`BusinessHour` ``str`` and ``repr``. (related to :issue:`44764`) - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (related to :issue:`44764`) +- .. --------------------------------------------------------------------------- .. _whatsnew_150.bug_fixes: From 1c00b3a62d9a6a44bbd9db57041592020c819b65 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 13 Apr 2022 10:55:13 +0200 Subject: [PATCH 14/34] Simplified sqlite adapter for time to handle both with and without timezone. Made it faster using f-string --- pandas/io/sql.py | 17 ++++------------- pandas/tests/io/test_sql.py | 12 ++++++++++-- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 5adb0048ef692..e56203bb88547 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1836,20 +1836,11 @@ def __init__(self, *args, **kwargs) -> None: # this will transform time(12,34,56,789) into '12:34:56.000789' # (this is what sqlalchemy does) - def _adapt(dt): - if dt.tzinfo is None: - # This is faster than strftime - return "%02d:%02d:%02d.%06d" % ( - dt.hour, - dt.minute, - dt.second, - dt.microsecond, - ) - else: - # Slow TODO we can probably find a way to use string formatting too - return dt.strftime("%H:%M:%S.%f") + def _adapt_time(t): + # This is faster than strftime + return f"{t.hour:02d}:{t.minute:02d}:{t.second:02d}.{t.microsecond:06d}" - sqlite3.register_adapter(time, _adapt) + sqlite3.register_adapter(time, _adapt_time) super().__init__(*args, **kwargs) def sql_schema(self): diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index e28901fa1a1ed..850570f8497d6 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2595,9 +2595,17 @@ def test_datetime_date(self): elif self.flavor == "mysql": tm.assert_frame_equal(res, df) - def test_datetime_time(self): + @pytest.mark.parametrize("tz_aware", [False, True]) + def test_datetime_time(self, tz_aware): # test support for datetime.time, GH #8341 - df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) + if not tz_aware: + tz_times = [time(9, 0, 0), time(9, 1, 30)] + else: + tz_dt = date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific") + tz_times = pd.Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz()) + + df = DataFrame(tz_times, columns=["a"]) + assert df.to_sql("test_time", self.conn, index=False) == 2 res = read_sql_query("SELECT * FROM test_time", self.conn) if self.flavor == "sqlite": From 2fde23906ac0b3a493921601c0483b75b169deb1 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Wed, 13 Apr 2022 10:57:59 +0200 Subject: [PATCH 15/34] Fixed test --- pandas/tests/io/test_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 850570f8497d6..d494e95646538 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2601,7 +2601,7 @@ def test_datetime_time(self, tz_aware): if not tz_aware: tz_times = [time(9, 0, 0), time(9, 1, 30)] else: - tz_dt = date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific") + tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific") tz_times = pd.Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz()) df = DataFrame(tz_times, columns=["a"]) From cb8b6d2adc7449659ed68129c90d4205c434b17c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sylvain=20Mari=C3=A9?= Date: Wed, 13 Apr 2022 11:00:22 +0200 Subject: [PATCH 16/34] Update pandas/_libs/tslibs/offsets.pyx --- pandas/_libs/tslibs/offsets.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index c9d938e6bdeb2..f66d1a49c0a43 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1591,7 +1591,6 @@ cdef class BusinessHour(BusinessMixin): def _repr_attrs(self) -> str: out = super()._repr_attrs() # Use python string formatting to be faster than strftime - # f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}' hours = ",".join( f'{st.hour:02d}:{st.minute:02d}-{en.hour:02d}:{en.minute:02d}' for st, en in zip(self.start, self.end) From 1e49c0505d99f9c4cd5bcc41173497ed6422cd06 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 12 May 2022 11:49:41 +0200 Subject: [PATCH 17/34] Fixed Flake8 warning --- pandas/tests/io/test_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index d494e95646538..5e3f8974cd576 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2602,7 +2602,7 @@ def test_datetime_time(self, tz_aware): tz_times = [time(9, 0, 0), time(9, 1, 30)] else: tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific") - tz_times = pd.Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz()) + tz_times = Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz()) df = DataFrame(tz_times, columns=["a"]) From 405a5f63fb73bc0807bb6a70efd676321baa1e3a Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 12 May 2022 23:32:36 +0200 Subject: [PATCH 18/34] Added ASVs for strftime related perf --- asv_bench/benchmarks/tslibs/strftime.py | 121 ++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 asv_bench/benchmarks/tslibs/strftime.py diff --git a/asv_bench/benchmarks/tslibs/strftime.py b/asv_bench/benchmarks/tslibs/strftime.py new file mode 100644 index 0000000000000..f08f47c074d6f --- /dev/null +++ b/asv_bench/benchmarks/tslibs/strftime.py @@ -0,0 +1,121 @@ +from inspect import getmembers, ismethod + +import numpy as np +import pandas as pd + +from pandas import offsets + +try: + import pandas.tseries.holiday +except ImportError: + pass + + +hcal = pandas.tseries.holiday.USFederalHolidayCalendar() +# These offsets currently raise a NotImplimentedError with .apply_index() +non_apply = [ + offsets.Day(), + offsets.BYearEnd(), + offsets.BYearBegin(), + offsets.BQuarterEnd(), + offsets.BQuarterBegin(), + offsets.BMonthEnd(), + offsets.BMonthBegin(), + offsets.CustomBusinessDay(), + offsets.CustomBusinessDay(calendar=hcal), + offsets.CustomBusinessMonthBegin(calendar=hcal), + offsets.CustomBusinessMonthEnd(calendar=hcal), + offsets.CustomBusinessMonthEnd(calendar=hcal), +] +other_offsets = [ + offsets.YearEnd(), + offsets.YearBegin(), + offsets.QuarterEnd(), + offsets.QuarterBegin(), + offsets.MonthEnd(), + offsets.MonthBegin(), + offsets.DateOffset(months=2, days=2), + offsets.BusinessHour(), + offsets.BusinessDay(), + offsets.SemiMonthEnd(), + offsets.SemiMonthBegin(), +] +offset_objs = non_apply + other_offsets + + +class DatetimeStrftime: + fname = "__test__.csv" + timeout = 1500 + params = [1000, 10000] + param_names = ["obs"] + + def setup(self, obs): + d = "2018-11-29" + dt = "2018-11-26 11:18:27.0" + self.data = pd.DataFrame( + { + "dt": [np.datetime64(dt)] * obs, + "d": [np.datetime64(d)] * obs, + "r": [np.random.uniform()] * obs, + } + ) + + def time_frame_date_no_formatting(self, obs): + self.data["d"].astype(str) + + def time_frame_date_formatting(self, obs): + self.data["d"].dt.strftime(date_format="%Y-%m-%d") + + def time_frame_datetime_no_formatting(self, obs): + self.data["dt"].astype(str) + + def time_frame_datetime_formatting(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S") + + def time_frame_datetime_formatting_with_float(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") + + def time_frame_datetime_formatting_date_only(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d") + + +class BusinessHourStrftime: + fname = "__test__.csv" + timeout = 1500 + params = ([1000, 10000], offset_objs) + param_names = ["obs", "offset"] + + def setup(self, obs, offset): + self.data = pd.DataFrame( + { + "off": [offset] * obs, + } + ) + + def time_frame_offset_str(self, obs, offset): + self.data["off"].apply(str) + + def time_frame_offset_repr(self, obs, offset): + self.data["off"].apply(repr) + + +if __name__ == '__main__': + # To debug this ASV benchmark, simply run this file with python + from itertools import product + for cls in (DatetimeStrftime, BusinessHourStrftime): + if len(cls.param_names) == 1: + all_params = [{cls.param_names[0]: p} for p in cls.params] + else: + all_params = [{n: p for n, p in zip(cls.param_names, ps)} + for ps in product(*cls.params)] + for kwargs in all_params: + kwargs_str = ','.join([f'{k}={v}' for k, v in kwargs.items()]) + print(f"Executing {cls} with {kwargs_str}") + o = cls() + o.setup(**kwargs) + for k, v in getmembers(o, predicate=ismethod): + if k != "setup": + print(f"Executing {v.__name__}({kwargs_str})") + v(**kwargs) + print(f"Executing {v.__name__}({kwargs_str}): DONE") + print(f"Executing {cls} with {kwargs_str}: DONE") From 76035d5d5634b951b077262956d135c901942029 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 12 May 2022 23:41:05 +0200 Subject: [PATCH 19/34] Removed all other offsets from the ASV as they are not relevant --- asv_bench/benchmarks/tslibs/strftime.py | 49 +++---------------------- 1 file changed, 6 insertions(+), 43 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/strftime.py b/asv_bench/benchmarks/tslibs/strftime.py index f08f47c074d6f..1ca148074c205 100644 --- a/asv_bench/benchmarks/tslibs/strftime.py +++ b/asv_bench/benchmarks/tslibs/strftime.py @@ -5,43 +5,6 @@ from pandas import offsets -try: - import pandas.tseries.holiday -except ImportError: - pass - - -hcal = pandas.tseries.holiday.USFederalHolidayCalendar() -# These offsets currently raise a NotImplimentedError with .apply_index() -non_apply = [ - offsets.Day(), - offsets.BYearEnd(), - offsets.BYearBegin(), - offsets.BQuarterEnd(), - offsets.BQuarterBegin(), - offsets.BMonthEnd(), - offsets.BMonthBegin(), - offsets.CustomBusinessDay(), - offsets.CustomBusinessDay(calendar=hcal), - offsets.CustomBusinessMonthBegin(calendar=hcal), - offsets.CustomBusinessMonthEnd(calendar=hcal), - offsets.CustomBusinessMonthEnd(calendar=hcal), -] -other_offsets = [ - offsets.YearEnd(), - offsets.YearBegin(), - offsets.QuarterEnd(), - offsets.QuarterBegin(), - offsets.MonthEnd(), - offsets.MonthBegin(), - offsets.DateOffset(months=2, days=2), - offsets.BusinessHour(), - offsets.BusinessDay(), - offsets.SemiMonthEnd(), - offsets.SemiMonthBegin(), -] -offset_objs = non_apply + other_offsets - class DatetimeStrftime: fname = "__test__.csv" @@ -82,20 +45,20 @@ def time_frame_datetime_formatting_date_only(self, obs): class BusinessHourStrftime: fname = "__test__.csv" timeout = 1500 - params = ([1000, 10000], offset_objs) - param_names = ["obs", "offset"] + params = [1000, 10000] + param_names = ["obs"] - def setup(self, obs, offset): + def setup(self, obs): self.data = pd.DataFrame( { - "off": [offset] * obs, + "off": [offsets.BusinessHour()] * obs, } ) - def time_frame_offset_str(self, obs, offset): + def time_frame_offset_str(self, obs): self.data["off"].apply(str) - def time_frame_offset_repr(self, obs, offset): + def time_frame_offset_repr(self, obs): self.data["off"].apply(repr) From a788135f57dbe014cac6bb9386a09276f0cbe0f4 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 12 May 2022 23:51:57 +0200 Subject: [PATCH 20/34] black and isort --- asv_bench/benchmarks/tslibs/strftime.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/strftime.py b/asv_bench/benchmarks/tslibs/strftime.py index 1ca148074c205..a3e4e0a597fe8 100644 --- a/asv_bench/benchmarks/tslibs/strftime.py +++ b/asv_bench/benchmarks/tslibs/strftime.py @@ -1,8 +1,11 @@ -from inspect import getmembers, ismethod +from inspect import ( + getmembers, + ismethod, +) import numpy as np -import pandas as pd +import pandas as pd from pandas import offsets @@ -62,17 +65,20 @@ def time_frame_offset_repr(self, obs): self.data["off"].apply(repr) -if __name__ == '__main__': +if __name__ == "__main__": # To debug this ASV benchmark, simply run this file with python from itertools import product + for cls in (DatetimeStrftime, BusinessHourStrftime): if len(cls.param_names) == 1: all_params = [{cls.param_names[0]: p} for p in cls.params] else: - all_params = [{n: p for n, p in zip(cls.param_names, ps)} - for ps in product(*cls.params)] + all_params = [ + {n: p for n, p in zip(cls.param_names, ps)} + for ps in product(*cls.params) + ] for kwargs in all_params: - kwargs_str = ','.join([f'{k}={v}' for k, v in kwargs.items()]) + kwargs_str = ",".join([f"{k}={v}" for k, v in kwargs.items()]) print(f"Executing {cls} with {kwargs_str}") o = cls() o.setup(**kwargs) From 3f3725ef82182f1a23e0397e96db8772aafcb707 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 13 May 2022 22:10:43 +0200 Subject: [PATCH 21/34] Added default format for dates --- pandas/_libs/tslib.pyx | 54 ++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index cdc26a6fb72c0..27f27102d1fac 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -122,7 +122,7 @@ def format_array_from_datetime( int64_t val, ns, N = len(values) ndarray[int64_t] consider_values bint show_ms = False, show_us = False, show_ns = False - bint basic_format = False + bint basic_format = False, basic_format_day = False ndarray[object] result = np.empty(N, dtype=object) object ts, res npy_datetimestruct dts @@ -130,36 +130,48 @@ def format_array_from_datetime( if na_rep is None: na_rep = 'NaT' - # if we don't have a format nor tz, then choose - # a format based on precision - basic_format = format is None and tz is None - if basic_format: - consider_values = values[values != NPY_NAT] - show_ns = (consider_values % 1000).any() + if tz is None: + # if we don't have a format nor tz, then choose + # a format based on precision + basic_format = format is None + if basic_format: + consider_values = values[values != NPY_NAT] + show_ns = (consider_values % 1000).any() - if not show_ns: - consider_values //= 1000 - show_us = (consider_values % 1000).any() - - if not show_ms: + if not show_ns: consider_values //= 1000 - show_ms = (consider_values % 1000).any() + show_us = (consider_values % 1000).any() + + if not show_ms: + consider_values //= 1000 + show_ms = (consider_values % 1000).any() - elif format == "%Y-%m-%d %H:%M:%S": - # Same format as default, but with hardcoded precision (s) - basic_format = True - show_ns = show_us = show_ms = False + elif format == "%Y-%m-%d %H:%M:%S": + # Same format as default, but with hardcoded precision (s) + basic_format = True + show_ns = show_us = show_ms = False - elif format == "%Y-%m-%d %H:%M:%S.%f": - # Same format as default, but with hardcoded precision (us) - basic_format = show_us = True - show_ns = show_ms = False + elif format == "%Y-%m-%d %H:%M:%S.%f": + # Same format as default, but with hardcoded precision (us) + basic_format = show_us = True + show_ns = show_ms = False + + elif format == "%Y-%m-%d": + # Default format for dates + basic_format_day = True for i in range(N): val = values[i] if val == NPY_NAT: result[i] = na_rep + elif basic_format_day: + + dt64_to_dtstruct(val, &dts) + res = f'{dts.year}-{dts.month:02d}-{dts.day:02d}' + + result[i] = res + elif basic_format: dt64_to_dtstruct(val, &dts) From 5eee0792929c1c197ece2cbfc582a0c8832e190d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 13 May 2022 22:20:32 +0200 Subject: [PATCH 22/34] FixedImproved ASVs --- asv_bench/benchmarks/tslibs/strftime.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/strftime.py b/asv_bench/benchmarks/tslibs/strftime.py index a3e4e0a597fe8..7947be39287bb 100644 --- a/asv_bench/benchmarks/tslibs/strftime.py +++ b/asv_bench/benchmarks/tslibs/strftime.py @@ -26,23 +26,29 @@ def setup(self, obs): } ) - def time_frame_date_no_formatting(self, obs): + def time_frame_date_to_str(self, obs): self.data["d"].astype(str) - def time_frame_date_formatting(self, obs): + def time_frame_date_formatting_default(self, obs): self.data["d"].dt.strftime(date_format="%Y-%m-%d") - def time_frame_datetime_no_formatting(self, obs): + def time_frame_date_formatting_custom(self, obs): + self.data["d"].dt.strftime(date_format="%Y---%m---%d") + + def time_frame_datetime_to_str(self, obs): self.data["dt"].astype(str) - def time_frame_datetime_formatting(self, obs): + def time_frame_datetime_formatting_default_date_only(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d") + + def time_frame_datetime_formatting_default(self, obs): self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S") - def time_frame_datetime_formatting_with_float(self, obs): + def time_frame_datetime_formatting_default_with_float(self, obs): self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") - def time_frame_datetime_formatting_date_only(self, obs): - self.data["dt"].dt.strftime(date_format="%Y-%m-%d") + def time_frame_datetime_formatting_custom(self, obs): + self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") class BusinessHourStrftime: From 7fcb31fc1d815bd6925a71086585b5aa6dbfb002 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 15 May 2022 15:07:52 +0200 Subject: [PATCH 23/34] Code review: removed __main__ in asv bench --- asv_bench/benchmarks/tslibs/strftime.py | 30 ------------------------- 1 file changed, 30 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/strftime.py b/asv_bench/benchmarks/tslibs/strftime.py index 7947be39287bb..2560f93f8fd59 100644 --- a/asv_bench/benchmarks/tslibs/strftime.py +++ b/asv_bench/benchmarks/tslibs/strftime.py @@ -1,8 +1,3 @@ -from inspect import ( - getmembers, - ismethod, -) - import numpy as np import pandas as pd @@ -69,28 +64,3 @@ def time_frame_offset_str(self, obs): def time_frame_offset_repr(self, obs): self.data["off"].apply(repr) - - -if __name__ == "__main__": - # To debug this ASV benchmark, simply run this file with python - from itertools import product - - for cls in (DatetimeStrftime, BusinessHourStrftime): - if len(cls.param_names) == 1: - all_params = [{cls.param_names[0]: p} for p in cls.params] - else: - all_params = [ - {n: p for n, p in zip(cls.param_names, ps)} - for ps in product(*cls.params) - ] - for kwargs in all_params: - kwargs_str = ",".join([f"{k}={v}" for k, v in kwargs.items()]) - print(f"Executing {cls} with {kwargs_str}") - o = cls() - o.setup(**kwargs) - for k, v in getmembers(o, predicate=ismethod): - if k != "setup": - print(f"Executing {v.__name__}({kwargs_str})") - v(**kwargs) - print(f"Executing {v.__name__}({kwargs_str}): DONE") - print(f"Executing {cls} with {kwargs_str}: DONE") From abc0032ee104b704c562ae86759194bf15ad60e1 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 15 May 2022 15:14:55 +0200 Subject: [PATCH 24/34] code review: whatsnew --- doc/source/whatsnew/v1.5.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6ec0cc4503825..6c3c930e7b9b9 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -573,9 +573,9 @@ Performance improvements - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`) - Performance improvement in :func:`factorize` (:issue:`46109`) - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) -- Performance improvement in :class:`BusinessHour` ``str`` and ``repr``. (related to :issue:`44764`) +- Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`) - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) -- Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (related to :issue:`44764`) +- Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`) - .. --------------------------------------------------------------------------- From 956c297aa71f66daa85a5116a05ada98470eeb6d Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 15 May 2022 15:25:46 +0200 Subject: [PATCH 25/34] Code review: added asserts to ensure basic_format and basic_format_day are exclusive --- pandas/_libs/tslib.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index b0d67aded4c47..9670fb16051ee 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -168,6 +168,7 @@ def format_array_from_datetime( if val == NPY_NAT: result[i] = na_rep elif basic_format_day: + assert not basic_format dt64_to_dtstruct(val, &dts) res = f'{dts.year}-{dts.month:02d}-{dts.day:02d}' @@ -175,6 +176,7 @@ def format_array_from_datetime( result[i] = res elif basic_format: + assert not basic_format_day dt64_to_dtstruct(val, &dts) res = (f'{dts.year}-{dts.month:02d}-{dts.day:02d} ' From ce6e4ac164d65c565703b8d669798495518d6b6f Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 15 May 2022 15:54:59 +0200 Subject: [PATCH 26/34] Code review: inline comment style --- pandas/io/excel/_odswriter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index f56e0ec7dec28..a6e125f4b9f33 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -189,15 +189,19 @@ def _make_table_cell(self, cell) -> tuple[object, Any]: value = str(val).lower() pvalue = str(val).upper() if isinstance(val, datetime.datetime): - value = val.isoformat() # fast formatting - pvalue = val.strftime("%c") # slow but locale-dependent + # Fast formatting + value = val.isoformat() + # Slow but locale-dependent + pvalue = val.strftime("%c") return ( pvalue, TableCell(valuetype="date", datevalue=value, attributes=attributes), ) elif isinstance(val, datetime.date): - value = f"{val.year}-{val.month:02d}-{val.day:02d}" # fast formatting - pvalue = val.strftime("%x") # slow but locale-dependent + # Fast formatting + value = f"{val.year}-{val.month:02d}-{val.day:02d}" + # Slow but locale-dependent + pvalue = val.strftime("%x") return ( pvalue, TableCell(valuetype="date", datevalue=value, attributes=attributes), From b54196f45f2f77e77b8e1c6401cf8b94ecdeacd6 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 15 May 2022 16:06:24 +0200 Subject: [PATCH 27/34] Attempt to improve SQL ASV (not tested) --- asv_bench/benchmarks/io/sql.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index 3cfa28de78c90..accf5ff06275a 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -39,6 +39,8 @@ def setup(self, connection): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["date"] = self.df["datetime"].date + self.df["time"] = self.df["datetime"].time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -53,7 +55,8 @@ class WriteSQLDtypes: params = ( ["sqlalchemy", "sqlite"], - ["float", "float_with_nan", "string", "bool", "int", "datetime"], + ["float", "float_with_nan", "string", "bool", "int", "date", "time", + "datetime"], ) param_names = ["connection", "dtype"] @@ -78,6 +81,8 @@ def setup(self, connection, dtype): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["date"] = self.df["datetime"].date + self.df["time"] = self.df["datetime"].time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -105,6 +110,8 @@ def setup(self): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["date"] = self.df["datetime"].date + self.df["time"] = self.df["datetime"].time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -122,7 +129,8 @@ def time_read_sql_table_parse_dates(self): class ReadSQLTableDtypes: - params = ["float", "float_with_nan", "string", "bool", "int", "datetime"] + params = ["float", "float_with_nan", "string", "bool", "int", "date", + "time", "datetime"] param_names = ["dtype"] def setup(self, dtype): @@ -141,6 +149,8 @@ def setup(self, dtype): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["date"] = self.df["datetime"].date + self.df["time"] = self.df["datetime"].time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") From 8df827c594a05d868654a2a0d7f8a6ef5aca6bd0 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Sun, 15 May 2022 22:31:20 +0200 Subject: [PATCH 28/34] Fixed SQL ASV Bench. --- asv_bench/benchmarks/io/sql.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index accf5ff06275a..272ca8ae6a0d3 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -39,8 +39,8 @@ def setup(self, connection): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan - self.df["date"] = self.df["datetime"].date - self.df["time"] = self.df["datetime"].time + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -81,8 +81,8 @@ def setup(self, connection, dtype): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan - self.df["date"] = self.df["datetime"].date - self.df["time"] = self.df["datetime"].time + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -110,8 +110,8 @@ def setup(self): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan - self.df["date"] = self.df["datetime"].date - self.df["time"] = self.df["datetime"].time + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") @@ -149,8 +149,8 @@ def setup(self, dtype): index=tm.makeStringIndex(N), ) self.df.loc[1000:3000, "float_with_nan"] = np.nan - self.df["date"] = self.df["datetime"].date - self.df["time"] = self.df["datetime"].time + self.df["date"] = self.df["datetime"].dt.date + self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) self.df.to_sql(self.table_name, self.con, if_exists="replace") From 898c047b09550cc391245d51cfd476910979d9ee Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 16 May 2022 09:25:34 +0200 Subject: [PATCH 29/34] Black --- asv_bench/benchmarks/io/sql.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index 272ca8ae6a0d3..fb8b7dafa0ade 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -55,8 +55,16 @@ class WriteSQLDtypes: params = ( ["sqlalchemy", "sqlite"], - ["float", "float_with_nan", "string", "bool", "int", "date", "time", - "datetime"], + [ + "float", + "float_with_nan", + "string", + "bool", + "int", + "date", + "time", + "datetime", + ], ) param_names = ["connection", "dtype"] @@ -129,8 +137,16 @@ def time_read_sql_table_parse_dates(self): class ReadSQLTableDtypes: - params = ["float", "float_with_nan", "string", "bool", "int", "date", - "time", "datetime"] + params = [ + "float", + "float_with_nan", + "string", + "bool", + "int", + "date", + "time", + "datetime", + ] param_names = ["dtype"] def setup(self, dtype): From 1b991e5e5c40eea416cad5db4680893882ae72a5 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 30 May 2022 17:45:41 +0200 Subject: [PATCH 30/34] As per code review: moved assert outside of for loop --- pandas/_libs/tslib.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 9670fb16051ee..9a62ef3e94d63 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -162,13 +162,14 @@ def format_array_from_datetime( # Default format for dates basic_format_day = True + assert not (basic_format_day and basic_format) + for i in range(N): val = values[i] if val == NPY_NAT: result[i] = na_rep elif basic_format_day: - assert not basic_format dt64_to_dtstruct(val, &dts) res = f'{dts.year}-{dts.month:02d}-{dts.day:02d}' @@ -176,7 +177,6 @@ def format_array_from_datetime( result[i] = res elif basic_format: - assert not basic_format_day dt64_to_dtstruct(val, &dts) res = (f'{dts.year}-{dts.month:02d}-{dts.day:02d} ' From 3e42fd4fca50251b120e83d502284797b5288398 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 7 Jun 2022 22:42:03 +0200 Subject: [PATCH 31/34] As per code review: removed 2 useless variables and edited a comment --- asv_bench/benchmarks/tslibs/strftime.py | 2 -- pandas/core/arrays/period.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/strftime.py b/asv_bench/benchmarks/tslibs/strftime.py index 2560f93f8fd59..ac1b7f65d2d90 100644 --- a/asv_bench/benchmarks/tslibs/strftime.py +++ b/asv_bench/benchmarks/tslibs/strftime.py @@ -5,7 +5,6 @@ class DatetimeStrftime: - fname = "__test__.csv" timeout = 1500 params = [1000, 10000] param_names = ["obs"] @@ -47,7 +46,6 @@ def time_frame_datetime_formatting_custom(self, obs): class BusinessHourStrftime: - fname = "__test__.csv" timeout = 1500 params = [1000, 10000] param_names = ["obs"] diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index d6d31d7354847..6d796647df410 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -646,7 +646,7 @@ def _format_native_types( if date_format: formatter = lambda per: per.strftime(date_format) else: - # Uses `_Period.str>format_period` + # Uses `_Period.str` which in turn uses `format_period` formatter = lambda per: str(per) # Apply the formatter to all values in the array, possibly with a mask From fc078b9d4896aa207b09ae03f819b98f08bec24c Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Tue, 7 Jun 2022 22:46:35 +0200 Subject: [PATCH 32/34] Fixed wrong merge --- pandas/_libs/tslib.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8926198d96b0a..5cf21957f0aee 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -144,8 +144,8 @@ def format_array_from_datetime( if basic_format: reso_obj = get_resolution(values) show_ns = reso_obj == Resolution.RESO_NS - show_us = reso_obj == Resolution.RESO_US - show_ms = reso_obj == Resolution.RESO_MS + show_us = reso_obj == Resolution.RESO_US + show_ms = reso_obj == Resolution.RESO_MS elif format == "%Y-%m-%d %H:%M:%S": # Same format as default, but with hardcoded precision (s) From 32b03a87d947f6c6060813dffafa1bc598bb044b Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Jun 2022 13:22:41 +0200 Subject: [PATCH 33/34] Aligned the basic format day with new `pandas_datetime_to_datetimestruct` --- pandas/_libs/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 262bb861c097e..31674f036d008 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -173,7 +173,7 @@ def format_array_from_datetime( result[i] = na_rep elif basic_format_day: - dt64_to_dtstruct(val, &dts) + pandas_datetime_to_datetimestruct(val, reso, &dts) res = f'{dts.year}-{dts.month:02d}-{dts.day:02d}' result[i] = res From 7adff9b3233c7072163532648521e43c46a6c64e Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Mon, 20 Jun 2022 22:44:07 +0200 Subject: [PATCH 34/34] Removed useless line --- pandas/_libs/tslib.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 3e6dc2b8b4296..ee3964b892e2e 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -184,8 +184,6 @@ def format_array_from_datetime( pandas_datetime_to_datetimestruct(val, reso, &dts) res = f'{dts.year}-{dts.month:02d}-{dts.day:02d}' - res = res - elif basic_format: pandas_datetime_to_datetimestruct(val, reso, &dts) @@ -200,7 +198,6 @@ def format_array_from_datetime( elif show_ms: res += f'.{dts.us // 1000:03d}' - else: ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz)