Commit e790c7d

arw2019 authored and Kevin D Smith committed
PERF: pd.to_datetime, unit='s' much slower for float64 than for int64 (pandas-dev#35027)
1 parent f88b228 commit e790c7d

6 files changed: +61 −22 lines
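For context, the slowdown this commit addresses can be reproduced with a sketch like the following (names and timings are illustrative and not part of the commit; the value range matches the new benchmark below). On pandas releases before this change, the float64 call was far slower than the int64 one:

    import numpy as np
    import pandas as pd
    from timeit import timeit

    # Epoch seconds as int64 vs. float64
    sec_int = pd.Series(np.arange(1_521_080_307, 1_521_685_107), dtype="int64")
    sec_float = sec_int.astype("float64")

    # Before this change, float input fell back to element-wise conversion,
    # so the second call was dramatically slower; afterwards they are comparable.
    print(timeit(lambda: pd.to_datetime(sec_int, unit="s"), number=10))
    print(timeit(lambda: pd.to_datetime(sec_float, unit="s"), number=10))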

asv_bench/benchmarks/timeseries.py (+23)

@@ -263,6 +263,29 @@ def time_lookup_and_cleanup(self):
         self.ts.index._cleanup()
 
 
+class ToDatetimeFromIntsFloats:
+    def setup(self):
+        self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64")
+        self.ts_sec_float = self.ts_sec.astype("float64")
+
+        self.ts_nanosec = 1_000_000 * self.ts_sec
+        self.ts_nanosec_float = self.ts_nanosec.astype("float64")
+
+    # speed of int64 and float64 paths should be comparable
+
+    def time_nanosec_int64(self):
+        to_datetime(self.ts_nanosec, unit="ns")
+
+    def time_nanosec_float64(self):
+        to_datetime(self.ts_nanosec_float, unit="ns")
+
+    def time_sec_int64(self):
+        to_datetime(self.ts_sec, unit="s")
+
+    def time_sec_float64(self):
+        to_datetime(self.ts_sec_float, unit="s")
+
+
 class ToDatetimeYYYYMMDD:
     def setup(self):
         rng = date_range(start="1/1/2000", periods=10000, freq="D")
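These asv benchmarks are run in the usual way from the asv_bench directory; a typical invocation per the pandas contributing guide would be something like `asv continuous -f 1.1 upstream/master HEAD -b timeseries.ToDatetimeFromIntsFloats` (shown for illustration; adjust to your local setup).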

doc/source/whatsnew/v1.2.0.rst (+1)

@@ -227,6 +227,7 @@ Performance improvements
 - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
 - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
 - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)
+- Performance improvement in :meth:`pd.to_datetime` with non-`ns` time unit for `float` `dtype` columns (:issue:`20445`)
 
 .. ---------------------------------------------------------------------------

pandas/_libs/tslib.pyx (+29 −17)

@@ -41,6 +41,7 @@ from pandas._libs.tslibs.conversion cimport (
     cast_from_unit,
     convert_datetime_to_tsobject,
     get_datetime64_nanos,
+    precision_from_unit,
 )
 from pandas._libs.tslibs.nattype cimport (
     NPY_NAT,
@@ -205,6 +206,7 @@ def array_with_unit_to_datetime(
     cdef:
         Py_ssize_t i, j, n=len(values)
         int64_t m
+        int prec = 0
         ndarray[float64_t] fvalues
         bint is_ignore = errors=='ignore'
         bint is_coerce = errors=='coerce'
@@ -217,38 +219,48 @@ def array_with_unit_to_datetime(
 
     assert is_ignore or is_coerce or is_raise
 
-    if unit == 'ns':
-        if issubclass(values.dtype.type, np.integer):
-            result = values.astype('M8[ns]')
+    if unit == "ns":
+        if issubclass(values.dtype.type, (np.integer, np.float_)):
+            result = values.astype("M8[ns]", copy=False)
         else:
             result, tz = array_to_datetime(values.astype(object), errors=errors)
         return result, tz
 
-    m = cast_from_unit(None, unit)
+    m, p = precision_from_unit(unit)
 
     if is_raise:
-
-        # try a quick conversion to i8
+        # try a quick conversion to i8/f8
         # if we have nulls that are not type-compat
         # then need to iterate
-        if values.dtype.kind == "i":
-            # Note: this condition makes the casting="same_kind" redundant
-            iresult = values.astype('i8', casting='same_kind', copy=False)
-            # fill by comparing to NPY_NAT constant
+
+        if values.dtype.kind == "i" or values.dtype.kind == "f":
+            iresult = values.astype("i8", copy=False)
+            # fill missing values by comparing to NPY_NAT
             mask = iresult == NPY_NAT
             iresult[mask] = 0
-            fvalues = iresult.astype('f8') * m
+            fvalues = iresult.astype("f8") * m
             need_to_iterate = False
 
-        # check the bounds
        if not need_to_iterate:
-
-            if ((fvalues < Timestamp.min.value).any()
-                    or (fvalues > Timestamp.max.value).any()):
+            # check the bounds
+            if (fvalues < Timestamp.min.value).any() or (
+                (fvalues > Timestamp.max.value).any()
+            ):
                 raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")
-            result = (iresult * m).astype('M8[ns]')
-            iresult = result.view('i8')
+
+            if values.dtype.kind == "i":
+                result = (iresult * m).astype("M8[ns]")
+
+            elif values.dtype.kind == "f":
+                fresult = (values * m).astype("f8")
+                fresult[mask] = 0
+                if prec:
+                    fresult = round(fresult, prec)
+                result = fresult.astype("M8[ns]", copy=False)
+
+            iresult = result.view("i8")
             iresult[mask] = NPY_NAT
+
             return result, tz
 
     result = np.empty(n, dtype='M8[ns]')
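The core of the change: instead of iterating element-by-element for float input, the float64 values are scaled to nanoseconds in one vectorized step, optionally rounded to the unit's precision (from `precision_from_unit`), cast to `datetime64[ns]`, and then have missing entries restored from the mask. A rough NumPy-level sketch of that fast path (illustrative only; the real implementation is the Cython above, and NaN handling is made explicit here rather than relying on the int64 cast):

    import numpy as np

    def float_epoch_to_m8ns(values: np.ndarray, m: int, prec: int = 0) -> np.ndarray:
        """Convert float64 epoch values (m nanoseconds per unit step) to datetime64[ns]."""
        mask = np.isnan(values)                  # missing entries
        fresult = values * m                     # scale to nanoseconds, still float64
        fresult[mask] = 0
        if prec:
            fresult = np.round(fresult, prec)    # trim spurious fractional noise
        result = fresult.astype("M8[ns]")        # one vectorized cast instead of a Python loop
        result[mask] = np.datetime64("NaT")      # restore missing values
        return result

    # unit="s" -> m = 1_000_000_000 nanoseconds per second
    print(float_epoch_to_m8ns(np.array([1_521_080_307.0, np.nan]), 1_000_000_000))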

pandas/_libs/tslibs/conversion.pxd (+1)

@@ -24,5 +24,6 @@ cdef int64_t get_datetime64_nanos(object val) except? -1
 
 cpdef datetime localize_pydatetime(datetime dt, object tz)
 cdef int64_t cast_from_unit(object ts, str unit) except? -1
+cpdef (int64_t, int) precision_from_unit(str unit)
 
 cdef int64_t normalize_i8_stamp(int64_t local_val) nogil
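Exposing `precision_from_unit` as `cpdef` lets `tslib.pyx` call it directly. Per its use in the diff above, it returns a pair: the number of nanoseconds per step of the requested unit (for example, 1_000_000_000 for unit "s") and a precision used to round away floating-point artifacts; the exact precision values are defined in conversion.pyx and should be checked there.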

pandas/tests/io/sas/data/datetime.csv (+2 −2)

@@ -1,5 +1,5 @@
 Date1,Date2,DateTime,DateTimeHi,Taiw
-1677-09-22,1677-09-22,1677-09-21 00:12:44,1677-09-21 00:12:43.145226,1912-01-01
+1677-09-22,1677-09-22,1677-09-21 00:12:44,1677-09-21 00:12:43.145225,1912-01-01
 1960-01-01,1960-01-01,1960-01-01 00:00:00,1960-01-01 00:00:00.000000,1960-01-01
 2016-02-29,2016-02-29,2016-02-29 23:59:59,2016-02-29 23:59:59.123456,2016-02-29
-2262-04-11,2262-04-11,2262-04-11 23:47:16,2262-04-11 23:47:16.854774,2262-04-11
+2262-04-11,2262-04-11,2262-04-11 23:47:16,2262-04-11 23:47:16.854775,2262-04-11

pandas/tests/tools/test_to_datetime.py (+5 −3)

@@ -1217,10 +1217,10 @@ def test_unit_mixed(self, cache):
 
     @pytest.mark.parametrize("cache", [True, False])
     def test_unit_rounding(self, cache):
-        # GH 14156: argument will incur floating point errors but no
-        # premature rounding
+        # GH 14156 & GH 20445: argument will incur floating point errors
+        # but no premature rounding
         result = pd.to_datetime(1434743731.8770001, unit="s", cache=cache)
-        expected = pd.Timestamp("2015-06-19 19:55:31.877000093")
+        expected = pd.Timestamp("2015-06-19 19:55:31.877000192")
         assert result == expected
 
     @pytest.mark.parametrize("cache", [True, False])
@@ -1454,6 +1454,8 @@ def test_to_datetime_unit(self):
             ]
             + [NaT]
         )
+        # GH20455 argument will incur floating point errors but no premature rounding
+        result = result.round("ms")
         tm.assert_series_equal(result, expected)
 
         s = pd.concat(
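The updated expected value reflects the new vectorized float path: 1434743731.8770001 is not exactly representable in float64, and scaling it to nanoseconds in a single vectorized multiplication surfaces that representation error slightly differently than the old element-wise path did (hence ...877000192 instead of ...877000093). A quick check of the new behaviour, using the value from the test above (requires a pandas build containing this commit):

    import pandas as pd

    ts = pd.to_datetime(1434743731.8770001, unit="s")
    print(ts)              # Timestamp('2015-06-19 19:55:31.877000192')
    print(ts.round("ms"))  # rounding to milliseconds recovers the intended .877 value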
