Skip to content

Commit ff6613e

Browse files
smarie authored and yehoshuadimarsky committed
PERF: Some faster date/time string formatting (pandas-dev#46759)
1 parent 8f13114 commit ff6613e

File tree

12 files changed

+164
-18
lines changed

12 files changed

+164
-18
lines changed

asv_bench/benchmarks/io/sql.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ def setup(self, connection):
3939
index=tm.makeStringIndex(N),
4040
)
4141
self.df.loc[1000:3000, "float_with_nan"] = np.nan
42+
self.df["date"] = self.df["datetime"].dt.date
43+
self.df["time"] = self.df["datetime"].dt.time
4244
self.df["datetime_string"] = self.df["datetime"].astype(str)
4345
self.df.to_sql(self.table_name, self.con, if_exists="replace")
4446

@@ -53,7 +55,16 @@ class WriteSQLDtypes:
5355

5456
params = (
5557
["sqlalchemy", "sqlite"],
56-
["float", "float_with_nan", "string", "bool", "int", "datetime"],
58+
[
59+
"float",
60+
"float_with_nan",
61+
"string",
62+
"bool",
63+
"int",
64+
"date",
65+
"time",
66+
"datetime",
67+
],
5768
)
5869
param_names = ["connection", "dtype"]
5970

@@ -78,6 +89,8 @@ def setup(self, connection, dtype):
7889
index=tm.makeStringIndex(N),
7990
)
8091
self.df.loc[1000:3000, "float_with_nan"] = np.nan
92+
self.df["date"] = self.df["datetime"].dt.date
93+
self.df["time"] = self.df["datetime"].dt.time
8194
self.df["datetime_string"] = self.df["datetime"].astype(str)
8295
self.df.to_sql(self.table_name, self.con, if_exists="replace")
8396

@@ -105,6 +118,8 @@ def setup(self):
105118
index=tm.makeStringIndex(N),
106119
)
107120
self.df.loc[1000:3000, "float_with_nan"] = np.nan
121+
self.df["date"] = self.df["datetime"].dt.date
122+
self.df["time"] = self.df["datetime"].dt.time
108123
self.df["datetime_string"] = self.df["datetime"].astype(str)
109124
self.df.to_sql(self.table_name, self.con, if_exists="replace")
110125

@@ -122,7 +137,16 @@ def time_read_sql_table_parse_dates(self):
122137

123138
class ReadSQLTableDtypes:
124139

125-
params = ["float", "float_with_nan", "string", "bool", "int", "datetime"]
140+
params = [
141+
"float",
142+
"float_with_nan",
143+
"string",
144+
"bool",
145+
"int",
146+
"date",
147+
"time",
148+
"datetime",
149+
]
126150
param_names = ["dtype"]
127151

128152
def setup(self, dtype):
@@ -141,6 +165,8 @@ def setup(self, dtype):
141165
index=tm.makeStringIndex(N),
142166
)
143167
self.df.loc[1000:3000, "float_with_nan"] = np.nan
168+
self.df["date"] = self.df["datetime"].dt.date
169+
self.df["time"] = self.df["datetime"].dt.time
144170
self.df["datetime_string"] = self.df["datetime"].astype(str)
145171
self.df.to_sql(self.table_name, self.con, if_exists="replace")
146172

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import numpy as np
2+
3+
import pandas as pd
4+
from pandas import offsets
5+
6+
7+
class DatetimeStrftime:
    """Time string formatting of datetime columns held in a DataFrame.

    The class follows the parametrized benchmark layout (``params`` /
    ``param_names`` / ``setup`` / ``time_*``): every ``time_*`` method is
    run once per value of ``obs``, the number of rows in the frame.
    """

    timeout = 1500
    params = [1000, 10000]
    param_names = ["obs"]

    def setup(self, obs):
        """Build ``self.data`` with ``obs`` identical rows.

        Columns
        -------
        dt : the timestamp ``2018-11-26 11:18:27.0`` repeated ``obs`` times
        d  : the date ``2018-11-29`` repeated ``obs`` times
        r  : one random float repeated ``obs`` times (not read by any
             ``time_*`` method of this class)
        """
        d = "2018-11-29"
        dt = "2018-11-26 11:18:27.0"
        self.data = pd.DataFrame(
            {
                "dt": [np.datetime64(dt)] * obs,
                "d": [np.datetime64(d)] * obs,
                "r": [np.random.uniform()] * obs,
            }
        )

    # -- date-only column ---------------------------------------------------

    def time_frame_date_to_str(self, obs):
        """Convert the date column with ``astype(str)``."""
        self.data["d"].astype(str)

    def time_frame_date_formatting_default(self, obs):
        """Format the date column with the ISO date format ``%Y-%m-%d``."""
        self.data["d"].dt.strftime(date_format="%Y-%m-%d")

    def time_frame_date_formatting_custom(self, obs):
        """Format the date column with a non-default separator."""
        self.data["d"].dt.strftime(date_format="%Y---%m---%d")

    # -- datetime column ----------------------------------------------------

    def time_frame_datetime_to_str(self, obs):
        """Convert the datetime column with ``astype(str)``."""
        self.data["dt"].astype(str)

    def time_frame_datetime_formatting_default_date_only(self, obs):
        """Format the datetime column keeping only the date part."""
        self.data["dt"].dt.strftime(date_format="%Y-%m-%d")

    def time_frame_datetime_formatting_default(self, obs):
        """Format the datetime column with second precision."""
        self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S")

    def time_frame_datetime_formatting_default_with_float(self, obs):
        """Format the datetime column including fractional seconds (``%f``)."""
        self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f")

    def time_frame_datetime_formatting_custom(self, obs):
        """Format the datetime column with a non-default separator."""
        self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S")
46+
47+
48+
class BusinessHourStrftime:
    """Time ``str`` and ``repr`` of ``BusinessHour`` offsets in a DataFrame.

    The class follows the parametrized benchmark layout (``params`` /
    ``param_names`` / ``setup`` / ``time_*``): every ``time_*`` method is
    run once per value of ``obs``, the number of rows in the frame.
    """

    timeout = 1500
    params = [1000, 10000]
    param_names = ["obs"]

    def setup(self, obs):
        """Build ``self.data`` with one column ``off`` of ``obs`` offsets.

        A single default-constructed ``BusinessHour`` object is repeated, so
        every row refers to the same offset instance.
        """
        self.data = pd.DataFrame(
            {
                "off": [offsets.BusinessHour()] * obs,
            }
        )

    def time_frame_offset_str(self, obs):
        """Apply ``str`` to every offset in the column."""
        self.data["off"].apply(str)

    def time_frame_offset_repr(self, obs):
        """Apply ``repr`` to every offset in the column."""
        self.data["off"].apply(repr)

doc/source/whatsnew/v1.5.0.rst

+4
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,10 @@ Performance improvements
792792
- Performance improvement in :func:`read_excel` when ``nrows`` argument provided (:issue:`32727`)
793793
- Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`)
794794
- Performance improvement in :meth:`MultiIndex.is_monotonic_increasing` (:issue:`47458`)
795+
- Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`)
796+
- Performance improvement in string formatting of datetime arrays when one of the default strftime formats ``"%Y-%m-%d"``, ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`)
797+
- Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`)
798+
-
795799

796800
.. ---------------------------------------------------------------------------
797801
.. _whatsnew_150.bug_fixes:

pandas/_libs/tslib.pyx

+33-10
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def format_array_from_datetime(
131131
cdef:
132132
int64_t val, ns, N = values.size
133133
bint show_ms = False, show_us = False, show_ns = False
134-
bint basic_format = False
134+
bint basic_format = False, basic_format_day = False
135135
_Timestamp ts
136136
object res
137137
npy_datetimestruct dts
@@ -147,21 +147,43 @@ def format_array_from_datetime(
147147
if na_rep is None:
148148
na_rep = 'NaT'
149149

150-
# if we don't have a format nor tz, then choose
151-
# a format based on precision
152-
basic_format = format is None and tz is None
153-
if basic_format:
154-
reso_obj = get_resolution(values, reso=reso)
155-
show_ns = reso_obj == Resolution.RESO_NS
156-
show_us = reso_obj == Resolution.RESO_US
157-
show_ms = reso_obj == Resolution.RESO_MS
150+
if tz is None:
151+
# if we don't have a format nor tz, then choose
152+
# a format based on precision
153+
basic_format = format is None
154+
if basic_format:
155+
reso_obj = get_resolution(values, reso=reso)
156+
show_ns = reso_obj == Resolution.RESO_NS
157+
show_us = reso_obj == Resolution.RESO_US
158+
show_ms = reso_obj == Resolution.RESO_MS
159+
160+
elif format == "%Y-%m-%d %H:%M:%S":
161+
# Same format as default, but with hardcoded precision (s)
162+
basic_format = True
163+
show_ns = show_us = show_ms = False
164+
165+
elif format == "%Y-%m-%d %H:%M:%S.%f":
166+
# Same format as default, but with hardcoded precision (us)
167+
basic_format = show_us = True
168+
show_ns = show_ms = False
169+
170+
elif format == "%Y-%m-%d":
171+
# Default format for dates
172+
basic_format_day = True
173+
174+
assert not (basic_format_day and basic_format)
158175

159176
for i in range(N):
160177
# Analogous to: utc_val = values[i]
161178
val = (<int64_t*>cnp.PyArray_ITER_DATA(it))[0]
162179

163180
if val == NPY_NAT:
164181
res = na_rep
182+
elif basic_format_day:
183+
184+
pandas_datetime_to_datetimestruct(val, reso, &dts)
185+
res = f'{dts.year}-{dts.month:02d}-{dts.day:02d}'
186+
165187
elif basic_format:
166188

167189
pandas_datetime_to_datetimestruct(val, reso, &dts)
@@ -176,11 +198,11 @@ def format_array_from_datetime(
176198
elif show_ms:
177199
res += f'.{dts.us // 1000:03d}'
178200

179-
180201
else:
181202

182203
ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz)
183204
if format is None:
205+
# Use datetime.__str__, which returns ts.isoformat(sep=' ')
184206
res = str(ts)
185207
else:
186208

@@ -190,6 +212,7 @@ def format_array_from_datetime(
190212
# Note: dispatches to pydatetime
191213
res = ts.strftime(format)
192214
except ValueError:
215+
# Use datetime.__str__, which returns ts.isoformat(sep=' ')
193216
res = str(ts)
194217

195218
# Note: we can index result directly instead of using PyArray_MultiIter_DATA

pandas/_libs/tslibs/offsets.pyx

+2-1
Original file line numberDiff line numberDiff line change
@@ -1566,8 +1566,9 @@ cdef class BusinessHour(BusinessMixin):
15661566

15671567
def _repr_attrs(self) -> str:
15681568
out = super()._repr_attrs()
1569+
# Use python string formatting to be faster than strftime
15691570
hours = ",".join(
1570-
f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}'
1571+
f'{st.hour:02d}:{st.minute:02d}-{en.hour:02d}:{en.minute:02d}'
15711572
for st, en in zip(self.start, self.end)
15721573
)
15731574
attrs = [f"{self._prefix}={hours}"]

pandas/_libs/tslibs/timestamps.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -1014,7 +1014,7 @@ cdef class _Timestamp(ABCTimestamp):
10141014
@property
10151015
def _date_repr(self) -> str:
10161016
# Ideal here would be self.strftime("%Y-%m-%d"), but
1017-
# the datetime strftime() methods require year >= 1900
1017+
# the datetime strftime() methods require year >= 1900 and are slower
10181018
return f'{self.year}-{self.month:02d}-{self.day:02d}'
10191019

10201020
@property

pandas/core/arrays/period.py

+3
Original file line numberDiff line numberDiff line change
@@ -646,11 +646,14 @@ def _format_native_types(
646646
"""
647647
values = self.astype(object)
648648

649+
# Create the formatter function
649650
if date_format:
650651
formatter = lambda per: per.strftime(date_format)
651652
else:
653+
# Uses `_Period.__str__`, which in turn uses `format_period`
652654
formatter = lambda per: str(per)
653655

656+
# Apply the formatter to all values in the array, possibly with a mask
654657
if self._hasna:
655658
mask = self._isnan
656659
values[mask] = na_rep

pandas/core/arrays/timedeltas.py

+1
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,7 @@ def _format_native_types(
430430
) -> npt.NDArray[np.object_]:
431431
from pandas.io.formats.format import get_format_timedelta64
432432

433+
# Relies on Timedelta._repr_base
433434
formatter = get_format_timedelta64(self._ndarray, na_rep)
434435
# equiv: np.array([formatter(x) for x in self._ndarray])
435436
# but independent of dimension

pandas/io/excel/_odswriter.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -189,14 +189,18 @@ def _make_table_cell(self, cell) -> tuple[object, Any]:
189189
value = str(val).lower()
190190
pvalue = str(val).upper()
191191
if isinstance(val, datetime.datetime):
192+
# Fast formatting
192193
value = val.isoformat()
194+
# Slow but locale-dependent
193195
pvalue = val.strftime("%c")
194196
return (
195197
pvalue,
196198
TableCell(valuetype="date", datevalue=value, attributes=attributes),
197199
)
198200
elif isinstance(val, datetime.date):
199-
value = val.strftime("%Y-%m-%d")
201+
# Fast formatting
202+
value = f"{val.year}-{val.month:02d}-{val.day:02d}"
203+
# Slow but locale-dependent
200204
pvalue = val.strftime("%x")
201205
return (
202206
pvalue,

pandas/io/formats/format.py

+8
Original file line numberDiff line numberDiff line change
@@ -1768,6 +1768,8 @@ def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str:
17681768
if x is NaT:
17691769
return nat_rep
17701770

1771+
# Timestamp.__str__ falls back to datetime.datetime.__str__ = isoformat(sep=' ')
1772+
# so it already uses string formatting rather than strftime (faster).
17711773
return str(x)
17721774

17731775

@@ -1782,12 +1784,15 @@ def _format_datetime64_dateonly(
17821784
if date_format:
17831785
return x.strftime(date_format)
17841786
else:
1787+
# Timestamp._date_repr relies on string formatting (faster than strftime)
17851788
return x._date_repr
17861789

17871790

17881791
def get_format_datetime64(
17891792
is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None
17901793
) -> Callable:
1794+
"""Return a formatter callable taking a datetime64 as input and providing
1795+
a string as output"""
17911796

17921797
if is_dates_only:
17931798
return lambda x: _format_datetime64_dateonly(
@@ -1808,6 +1813,7 @@ def get_format_datetime64_from_values(
18081813

18091814
ido = is_dates_only(values)
18101815
if ido:
1816+
# Only dates and no timezone: provide a default format
18111817
return date_format or "%Y-%m-%d"
18121818
return date_format
18131819

@@ -1881,6 +1887,8 @@ def _formatter(x):
18811887

18821888
if not isinstance(x, Timedelta):
18831889
x = Timedelta(x)
1890+
1891+
# Timedelta._repr_base uses string formatting (faster than strftime)
18841892
result = x._repr_base(format=format)
18851893
if box:
18861894
result = f"'{result}'"

pandas/io/sql.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1848,7 +1848,11 @@ def __init__(self, *args, **kwargs) -> None:
18481848

18491849
# this will transform time(12,34,56,789) into '12:34:56.000789'
18501850
# (this is what sqlalchemy does)
1851-
sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f"))
1851+
def _adapt_time(t):
1852+
# This is faster than strftime
1853+
return f"{t.hour:02d}:{t.minute:02d}:{t.second:02d}.{t.microsecond:06d}"
1854+
1855+
sqlite3.register_adapter(time, _adapt_time)
18521856
super().__init__(*args, **kwargs)
18531857

18541858
def sql_schema(self) -> str:

pandas/tests/io/test_sql.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -2595,9 +2595,17 @@ def test_datetime_date(self):
25952595
elif self.flavor == "mysql":
25962596
tm.assert_frame_equal(res, df)
25972597

2598-
def test_datetime_time(self):
2598+
@pytest.mark.parametrize("tz_aware", [False, True])
2599+
def test_datetime_time(self, tz_aware):
25992600
# test support for datetime.time, GH #8341
2600-
df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"])
2601+
if not tz_aware:
2602+
tz_times = [time(9, 0, 0), time(9, 1, 30)]
2603+
else:
2604+
tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific")
2605+
tz_times = Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz())
2606+
2607+
df = DataFrame(tz_times, columns=["a"])
2608+
26012609
assert df.to_sql("test_time", self.conn, index=False) == 2
26022610
res = read_sql_query("SELECT * FROM test_time", self.conn)
26032611
if self.flavor == "sqlite":

0 commit comments

Comments
 (0)