Skip to content

PERF: Some faster date/time string formatting #46759

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 40 commits into from
Jul 1, 2022
Merged
Show file tree
Hide file tree
Changes from 37 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
01b9667
Performance improvement in :class:`BusinessHour`, ``repr`` is now 4 t…
Apr 12, 2022
139f229
For the record: switching the repr in TimeDelta to old-style formatting
Apr 12, 2022
75e8fd6
added comment
Apr 12, 2022
0beed46
Minor: comment on speed
Apr 12, 2022
cdffff8
added two comments for maintenance, about speed
Apr 12, 2022
903d08c
Performance improvement in :class:`DatetimeArray` and :class:`Datetim…
Apr 12, 2022
f6e4e5a
A few comments related to time formatting
Apr 12, 2022
9851b36
Performance improvement in :meth:`Series.to_excel` and :meth:`DataFra…
Apr 12, 2022
d302fe1
Added maintenance comments
Apr 12, 2022
54f51f0
Performance improvement in `Series.to_sql` and `DataFrame.to_sql` (`S…
Apr 12, 2022
378e7f4
Merge branch 'main' of https://github.com/pandas-dev/pandas into feat…
Apr 12, 2022
6d60a04
Revert "For the record: switching the repr in TimeDelta to old-style …
Apr 13, 2022
705471f
Updated what's new wrt code review
Apr 13, 2022
a5203ff
Fixed what's new error
Apr 13, 2022
1c00b3a
Simplified sqlite adapter for time to handle both with and without ti…
Apr 13, 2022
2fde239
Fixed test
Apr 13, 2022
cb8b6d2
Update pandas/_libs/tslibs/offsets.pyx
smarie Apr 13, 2022
1e49c05
Fixed Flake8 warning
May 12, 2022
405a5f6
Added ASVs for strftime related perf
May 12, 2022
76035d5
Removed all other offsets from the ASV as they are not relevant
May 12, 2022
a788135
black and isort
May 12, 2022
3f3725e
Added default format for dates
May 13, 2022
5eee079
FixedImproved ASVs
May 13, 2022
165aefa
Merge branch 'main' of https://github.com/pandas-dev/pandas into feat…
May 13, 2022
7fcb31f
Code review: removed __main__ in asv bench
May 15, 2022
abc0032
code review: whatsnew
May 15, 2022
956c297
Code review: added asserts to ensure basic_format and basic_format_da…
May 15, 2022
ce6e4ac
Code review: inline comment style
May 15, 2022
b54196f
Attempt to improve SQL ASV (not tested)
May 15, 2022
8df827c
Fixed SQL ASV Bench.
May 15, 2022
898c047
Black
May 16, 2022
1b991e5
As per code review: moved assert outside of for loop
May 30, 2022
0a6e8fb
Merge branch 'main' of https://github.com/pandas-dev/pandas into feat…
Jun 7, 2022
3e42fd4
As per code review: removed 2 useless variables and edited a comment
Jun 7, 2022
fc078b9
Fixed wrong merge
Jun 7, 2022
e410b31
Merge branch 'main' of https://github.com/pandas-dev/pandas into feat…
Jun 16, 2022
32b03a8
Aligned the basic format day with new `pandas_datetime_to_datetimestr…
Jun 16, 2022
6561e77
Merge branch 'main' of https://github.com/pandas-dev/pandas into feat…
Jun 20, 2022
7adff9b
Removed useless line
Jun 20, 2022
d104bb4
Merge branch 'main' of https://github.com/pandas-dev/pandas into feat…
Jul 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions asv_bench/benchmarks/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ def setup(self, connection):
index=tm.makeStringIndex(N),
)
self.df.loc[1000:3000, "float_with_nan"] = np.nan
self.df["date"] = self.df["datetime"].dt.date
self.df["time"] = self.df["datetime"].dt.time
self.df["datetime_string"] = self.df["datetime"].astype(str)
self.df.to_sql(self.table_name, self.con, if_exists="replace")

Expand All @@ -53,7 +55,16 @@ class WriteSQLDtypes:

params = (
["sqlalchemy", "sqlite"],
["float", "float_with_nan", "string", "bool", "int", "datetime"],
[
"float",
"float_with_nan",
"string",
"bool",
"int",
"date",
"time",
"datetime",
],
)
param_names = ["connection", "dtype"]

Expand All @@ -78,6 +89,8 @@ def setup(self, connection, dtype):
index=tm.makeStringIndex(N),
)
self.df.loc[1000:3000, "float_with_nan"] = np.nan
self.df["date"] = self.df["datetime"].dt.date
self.df["time"] = self.df["datetime"].dt.time
self.df["datetime_string"] = self.df["datetime"].astype(str)
self.df.to_sql(self.table_name, self.con, if_exists="replace")

Expand Down Expand Up @@ -105,6 +118,8 @@ def setup(self):
index=tm.makeStringIndex(N),
)
self.df.loc[1000:3000, "float_with_nan"] = np.nan
self.df["date"] = self.df["datetime"].dt.date
self.df["time"] = self.df["datetime"].dt.time
self.df["datetime_string"] = self.df["datetime"].astype(str)
self.df.to_sql(self.table_name, self.con, if_exists="replace")

Expand All @@ -122,7 +137,16 @@ def time_read_sql_table_parse_dates(self):

class ReadSQLTableDtypes:

params = ["float", "float_with_nan", "string", "bool", "int", "datetime"]
params = [
"float",
"float_with_nan",
"string",
"bool",
"int",
"date",
"time",
"datetime",
]
param_names = ["dtype"]

def setup(self, dtype):
Expand All @@ -141,6 +165,8 @@ def setup(self, dtype):
index=tm.makeStringIndex(N),
)
self.df.loc[1000:3000, "float_with_nan"] = np.nan
self.df["date"] = self.df["datetime"].dt.date
self.df["time"] = self.df["datetime"].dt.time
self.df["datetime_string"] = self.df["datetime"].astype(str)
self.df.to_sql(self.table_name, self.con, if_exists="replace")

Expand Down
64 changes: 64 additions & 0 deletions asv_bench/benchmarks/tslibs/strftime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import numpy as np

import pandas as pd
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry for the late review; these benchmarks don't belong in the tslibs directory (unless you can re-write them to not depend on anything outside of tslibs). can you do a follow-up to move them?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, sure. Do you think that moving them one folder up would be enough?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that would do it, yes. thanks

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in #47665. Let's wait for a successful build, just in case.

from pandas import offsets


class DatetimeStrftime:
    """Benchmarks for converting datetime64 columns of a frame to strings.

    Each ``time_*`` method formats one column of the frame built in
    ``setup``, either through ``astype(str)`` or through ``.dt.strftime``
    with a given ``date_format``. The ``obs`` argument is received by every
    method but only ``setup`` uses it.
    """

    timeout = 1500
    params = [1000, 10000]
    param_names = ["obs"]

    def setup(self, obs):
        """Build ``self.data`` with ``obs`` rows of repeated values.

        Columns: ``"dt"`` (a datetime with a time part), ``"d"`` (a plain
        date) and ``"r"`` (one random float repeated on every row).
        """
        day_stamp = np.datetime64("2018-11-29")
        second_stamp = np.datetime64("2018-11-26 11:18:27.0")
        columns = {
            "dt": [second_stamp] * obs,
            "d": [day_stamp] * obs,
            "r": [np.random.uniform()] * obs,
        }
        self.data = pd.DataFrame(columns)

    def time_frame_date_to_str(self, obs):
        """Cast the date column to ``str``."""
        date_column = self.data["d"]
        date_column.astype(str)

    def time_frame_date_formatting_default(self, obs):
        """Format the date column with ``%Y-%m-%d``."""
        date_column = self.data["d"]
        date_column.dt.strftime(date_format="%Y-%m-%d")

    def time_frame_date_formatting_custom(self, obs):
        """Format the date column with a custom separator pattern."""
        date_column = self.data["d"]
        date_column.dt.strftime(date_format="%Y---%m---%d")

    def time_frame_datetime_to_str(self, obs):
        """Cast the datetime column to ``str``."""
        stamp_column = self.data["dt"]
        stamp_column.astype(str)

    def time_frame_datetime_formatting_default_date_only(self, obs):
        """Format the datetime column keeping only the date part."""
        stamp_column = self.data["dt"]
        stamp_column.dt.strftime(date_format="%Y-%m-%d")

    def time_frame_datetime_formatting_default(self, obs):
        """Format the datetime column with ``%Y-%m-%d %H:%M:%S``."""
        stamp_column = self.data["dt"]
        stamp_column.dt.strftime(date_format="%Y-%m-%d %H:%M:%S")

    def time_frame_datetime_formatting_default_with_float(self, obs):
        """Format the datetime column with ``%Y-%m-%d %H:%M:%S.%f``."""
        stamp_column = self.data["dt"]
        stamp_column.dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f")

    def time_frame_datetime_formatting_custom(self, obs):
        """Format the datetime column with a custom date/time separator."""
        stamp_column = self.data["dt"]
        stamp_column.dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S")


class BusinessHourStrftime:
    """Benchmarks for ``str`` and ``repr`` of ``BusinessHour`` offsets.

    The ``obs`` argument is received by every method but only ``setup``
    uses it.
    """

    timeout = 1500
    params = [1000, 10000]
    param_names = ["obs"]

    def setup(self, obs):
        """Build ``self.data`` with one ``"off"`` column of ``obs`` entries.

        A single default ``BusinessHour`` object is created and repeated,
        so every row refers to the same offset instance.
        """
        business_hour = offsets.BusinessHour()
        self.data = pd.DataFrame({"off": [business_hour] * obs})

    def time_frame_offset_str(self, obs):
        """Apply ``str`` to every offset in the column."""
        offset_column = self.data["off"]
        offset_column.apply(str)

    def time_frame_offset_repr(self, obs):
        """Apply ``repr`` to every offset in the column."""
        offset_column = self.data["off"]
        offset_column.apply(repr)
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,10 @@ Performance improvements
- Performance improvement in :func:`factorize` (:issue:`46109`)
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
- Performance improvement in :func:`read_excel` when ``nrows`` argument provided (:issue:`32727`)
- Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`)
- Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`)
- Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does "time" arrays refer to datetime64 dtype?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, rather datetime.time (obtained from a datetime64 using .dt.time)

-

.. ---------------------------------------------------------------------------
.. _whatsnew_150.bug_fixes:
Expand Down
44 changes: 35 additions & 9 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def format_array_from_datetime(
cdef:
int64_t val, ns, N = len(values)
bint show_ms = False, show_us = False, show_ns = False
bint basic_format = False
bint basic_format = False, basic_format_day = False
ndarray[object] result = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
_Timestamp ts
str res
Expand All @@ -140,20 +140,44 @@ def format_array_from_datetime(
if na_rep is None:
na_rep = 'NaT'

# if we don't have a format nor tz, then choose
# a format based on precision
basic_format = format is None and tz is None
if basic_format:
reso_obj = get_resolution(values, reso=reso)
show_ns = reso_obj == Resolution.RESO_NS
show_us = reso_obj == Resolution.RESO_US
show_ms = reso_obj == Resolution.RESO_MS
if tz is None:
# if we don't have a format nor tz, then choose
# a format based on precision
basic_format = format is None
if basic_format:
reso_obj = get_resolution(values, reso=reso)
show_ns = reso_obj == Resolution.RESO_NS
show_us = reso_obj == Resolution.RESO_US
show_ms = reso_obj == Resolution.RESO_MS

elif format == "%Y-%m-%d %H:%M:%S":
# Same format as default, but with hardcoded precision (s)
basic_format = True
show_ns = show_us = show_ms = False

elif format == "%Y-%m-%d %H:%M:%S.%f":
# Same format as default, but with hardcoded precision (us)
basic_format = show_us = True
show_ns = show_ms = False

elif format == "%Y-%m-%d":
# Default format for dates
basic_format_day = True

assert not (basic_format_day and basic_format)

for i in range(N):
val = values[i]

if val == NPY_NAT:
result[i] = na_rep
elif basic_format_day:

pandas_datetime_to_datetimestruct(val, reso, &dts)
res = f'{dts.year}-{dts.month:02d}-{dts.day:02d}'

result[i] = res

elif basic_format:

pandas_datetime_to_datetimestruct(val, reso, &dts)
Expand All @@ -174,6 +198,7 @@ def format_array_from_datetime(

ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz)
if format is None:
# Use datetime.str, that returns ts.isoformat(sep=' ')
result[i] = str(ts)
else:

Expand All @@ -183,6 +208,7 @@ def format_array_from_datetime(
# Note: dispatches to pydatetime
result[i] = ts.strftime(format)
except ValueError:
# Use datetime.str, that returns ts.isoformat(sep=' ')
result[i] = str(ts)

return result
Expand Down
3 changes: 2 additions & 1 deletion pandas/_libs/tslibs/offsets.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1566,8 +1566,9 @@ cdef class BusinessHour(BusinessMixin):

def _repr_attrs(self) -> str:
out = super()._repr_attrs()
# Use python string formatting to be faster than strftime
hours = ",".join(
f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}'
f'{st.hour:02d}:{st.minute:02d}-{en.hour:02d}:{en.minute:02d}'
for st, en in zip(self.start, self.end)
)
attrs = [f"{self._prefix}={hours}"]
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/timestamps.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -984,7 +984,7 @@ cdef class _Timestamp(ABCTimestamp):
@property
def _date_repr(self) -> str:
# Ideal here would be self.strftime("%Y-%m-%d"), but
# the datetime strftime() methods require year >= 1900
# the datetime strftime() methods require year >= 1900 and are slower
return f'{self.year}-{self.month:02d}-{self.day:02d}'

@property
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -643,11 +643,14 @@ def _format_native_types(
"""
values = self.astype(object)

# Create the formatter function
if date_format:
formatter = lambda per: per.strftime(date_format)
else:
# Uses `_Period.str` which in turn uses `format_period`
formatter = lambda per: str(per)

# Apply the formatter to all values in the array, possibly with a mask
if self._hasna:
mask = self._isnan
values[mask] = na_rep
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,7 @@ def _format_native_types(
) -> npt.NDArray[np.object_]:
from pandas.io.formats.format import get_format_timedelta64

# Relies on Timedelta._repr_base
formatter = get_format_timedelta64(self._ndarray, na_rep)
# equiv: np.array([formatter(x) for x in self._ndarray])
# but independent of dimension
Expand Down
6 changes: 5 additions & 1 deletion pandas/io/excel/_odswriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,14 +189,18 @@ def _make_table_cell(self, cell) -> tuple[object, Any]:
value = str(val).lower()
pvalue = str(val).upper()
if isinstance(val, datetime.datetime):
# Fast formatting
value = val.isoformat()
# Slow but locale-dependent
pvalue = val.strftime("%c")
return (
pvalue,
TableCell(valuetype="date", datevalue=value, attributes=attributes),
)
elif isinstance(val, datetime.date):
value = val.strftime("%Y-%m-%d")
# Fast formatting
value = f"{val.year}-{val.month:02d}-{val.day:02d}"
# Slow but locale-dependent
pvalue = val.strftime("%x")
return (
pvalue,
Expand Down
8 changes: 8 additions & 0 deletions pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1764,6 +1764,8 @@ def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str:
if x is NaT:
return nat_rep

# Timestamp.__str__ falls back to datetime.datetime.__str__ = isoformat(sep=' ')
# so it already uses string formatting rather than strftime (faster).
return str(x)


Expand All @@ -1778,12 +1780,15 @@ def _format_datetime64_dateonly(
if date_format:
return x.strftime(date_format)
else:
# Timestamp._date_repr relies on string formatting (faster than strftime)
return x._date_repr


def get_format_datetime64(
is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None
) -> Callable:
"""Return a formatter callable taking a datetime64 as input and providing
a string as output"""

if is_dates_only:
return lambda x: _format_datetime64_dateonly(
Expand All @@ -1804,6 +1809,7 @@ def get_format_datetime64_from_values(

ido = is_dates_only(values)
if ido:
# Only dates and no timezone: provide a default format
return date_format or "%Y-%m-%d"
return date_format

Expand Down Expand Up @@ -1877,6 +1883,8 @@ def _formatter(x):

if not isinstance(x, Timedelta):
x = Timedelta(x)

# Timedelta._repr_base uses string formatting (faster than strftime)
result = x._repr_base(format=format)
if box:
result = f"'{result}'"
Expand Down
6 changes: 5 additions & 1 deletion pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -1836,7 +1836,11 @@ def __init__(self, *args, **kwargs) -> None:

# this will transform time(12,34,56,789) into '12:34:56.000789'
# (this is what sqlalchemy does)
sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f"))
def _adapt_time(t):
# This is faster than strftime
return f"{t.hour:02d}:{t.minute:02d}:{t.second:02d}.{t.microsecond:06d}"

sqlite3.register_adapter(time, _adapt_time)
super().__init__(*args, **kwargs)

def sql_schema(self):
Expand Down
12 changes: 10 additions & 2 deletions pandas/tests/io/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -2595,9 +2595,17 @@ def test_datetime_date(self):
elif self.flavor == "mysql":
tm.assert_frame_equal(res, df)

def test_datetime_time(self):
@pytest.mark.parametrize("tz_aware", [False, True])
def test_datetime_time(self, tz_aware):
# test support for datetime.time, GH #8341
df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"])
if not tz_aware:
tz_times = [time(9, 0, 0), time(9, 1, 30)]
else:
tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific")
tz_times = Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz())

df = DataFrame(tz_times, columns=["a"])

assert df.to_sql("test_time", self.conn, index=False) == 2
res = read_sql_query("SELECT * FROM test_time", self.conn)
if self.flavor == "sqlite":
Expand Down