Commit e790c7d

arw2019 authored and Kevin D Smith committed
PERF: pd.to_datetime, unit='s' much slower for float64 than for int64 (pandas-dev#35027)
1 parent f88b228 commit e790c7d

6 files changed: +61 −22 lines
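For context, the slowdown this commit addresses can be reproduced with a sketch like the following (names and timings are illustrative and not part of the commit; the value range matches the new benchmark below). On pandas releases before this change, the float64 call was far slower than the int64 one:

    import numpy as np
    import pandas as pd
    from timeit import timeit

    # Epoch seconds as int64 vs. float64
    sec_int = pd.Series(np.arange(1_521_080_307, 1_521_685_107), dtype="int64")
    sec_float = sec_int.astype("float64")

    # Before this change, float input fell back to element-wise conversion,
    # so the second call was dramatically slower; afterwards they are comparable.
    print(timeit(lambda: pd.to_datetime(sec_int, unit="s"), number=10))
    print(timeit(lambda: pd.to_datetime(sec_float, unit="s"), number=10))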

asv_bench/benchmarks/timeseries.py (+23)

@@ -263,6 +263,29 @@ def time_lookup_and_cleanup(self):
         self.ts.index._cleanup()
 
 
+class ToDatetimeFromIntsFloats:
+    def setup(self):
+        self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64")
+        self.ts_sec_float = self.ts_sec.astype("float64")
+
+        self.ts_nanosec = 1_000_000 * self.ts_sec
+        self.ts_nanosec_float = self.ts_nanosec.astype("float64")
+
+    # speed of int64 and float64 paths should be comparable
+
+    def time_nanosec_int64(self):
+        to_datetime(self.ts_nanosec, unit="ns")
+
+    def time_nanosec_float64(self):
+        to_datetime(self.ts_nanosec_float, unit="ns")
+
+    def time_sec_int64(self):
+        to_datetime(self.ts_sec, unit="s")
+
+    def time_sec_float64(self):
+        to_datetime(self.ts_sec_float, unit="s")
+
+
 class ToDatetimeYYYYMMDD:
     def setup(self):
         rng = date_range(start="1/1/2000", periods=10000, freq="D")
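These asv benchmarks are run in the usual way from the asv_bench directory; a typical invocation per the pandas contributing guide would be something like `asv continuous -f 1.1 upstream/master HEAD -b timeseries.ToDatetimeFromIntsFloats` (shown for illustration; adjust to your local setup).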

doc/source/whatsnew/v1.2.0.rst (+1)

@@ -227,6 +227,7 @@ Performance improvements
 - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
 - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
 - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)
+- Performance improvement in :meth:`pd.to_datetime` with non-`ns` time unit for `float` `dtype` columns (:issue:`20445`)
 
 .. ---------------------------------------------------------------------------

pandas/_libs/tslib.pyx (+29 −17)

@@ -41,6 +41,7 @@ from pandas._libs.tslibs.conversion cimport (
     cast_from_unit,
     convert_datetime_to_tsobject,
     get_datetime64_nanos,
+    precision_from_unit,
 )
 from pandas._libs.tslibs.nattype cimport (
     NPY_NAT,
@@ -205,6 +206,7 @@ def array_with_unit_to_datetime(
     cdef:
         Py_ssize_t i, j, n=len(values)
         int64_t m
+        int prec = 0
         ndarray[float64_t] fvalues
         bint is_ignore = errors=='ignore'
         bint is_coerce = errors=='coerce'
@@ -217,38 +219,48 @@ def array_with_unit_to_datetime(
 
     assert is_ignore or is_coerce or is_raise
 
-    if unit == 'ns':
-        if issubclass(values.dtype.type, np.integer):
-            result = values.astype('M8[ns]')
+    if unit == "ns":
+        if issubclass(values.dtype.type, (np.integer, np.float_)):
+            result = values.astype("M8[ns]", copy=False)
         else:
             result, tz = array_to_datetime(values.astype(object), errors=errors)
         return result, tz
 
-    m = cast_from_unit(None, unit)
+    m, p = precision_from_unit(unit)
 
     if is_raise:
-
-        # try a quick conversion to i8
+        # try a quick conversion to i8/f8
         # if we have nulls that are not type-compat
         # then need to iterate
-        if values.dtype.kind == "i":
-            # Note: this condition makes the casting="same_kind" redundant
-            iresult = values.astype('i8', casting='same_kind', copy=False)
-            # fill by comparing to NPY_NAT constant
+
+        if values.dtype.kind == "i" or values.dtype.kind == "f":
+            iresult = values.astype("i8", copy=False)
+            # fill missing values by comparing to NPY_NAT
             mask = iresult == NPY_NAT
             iresult[mask] = 0
-            fvalues = iresult.astype('f8') * m
+            fvalues = iresult.astype("f8") * m
             need_to_iterate = False
 
-        # check the bounds
        if not need_to_iterate:
-
-            if ((fvalues < Timestamp.min.value).any()
-                    or (fvalues > Timestamp.max.value).any()):
+            # check the bounds
+            if (fvalues < Timestamp.min.value).any() or (
+                (fvalues > Timestamp.max.value).any()
+            ):
                 raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")
-            result = (iresult * m).astype('M8[ns]')
-            iresult = result.view('i8')
+
+            if values.dtype.kind == "i":
+                result = (iresult * m).astype("M8[ns]")
+
+            elif values.dtype.kind == "f":
+                fresult = (values * m).astype("f8")
+                fresult[mask] = 0
+                if prec:
+                    fresult = round(fresult, prec)
+                result = fresult.astype("M8[ns]", copy=False)
+
+            iresult = result.view("i8")
             iresult[mask] = NPY_NAT
+
             return result, tz
 
     result = np.empty(n, dtype='M8[ns]')
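The core of the change: instead of iterating element-by-element for float input, the float64 values are scaled to nanoseconds in one vectorized step, optionally rounded to the unit's precision (from `precision_from_unit`), cast to `datetime64[ns]`, and then have missing entries restored from the mask. A rough NumPy-level sketch of that fast path (illustrative only; the real implementation is the Cython above, and NaN handling is made explicit here rather than relying on the int64 cast):

    import numpy as np

    def float_epoch_to_m8ns(values: np.ndarray, m: int, prec: int = 0) -> np.ndarray:
        """Convert float64 epoch values (m nanoseconds per unit step) to datetime64[ns]."""
        mask = np.isnan(values)                  # missing entries
        fresult = values * m                     # scale to nanoseconds, still float64
        fresult[mask] = 0
        if prec:
            fresult = np.round(fresult, prec)    # trim spurious fractional noise
        result = fresult.astype("M8[ns]")        # one vectorized cast instead of a Python loop
        result[mask] = np.datetime64("NaT")      # restore missing values
        return result

    # unit="s" -> m = 1_000_000_000 nanoseconds per second
    print(float_epoch_to_m8ns(np.array([1_521_080_307.0, np.nan]), 1_000_000_000))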

pandas/_libs/tslibs/conversion.pxd (+1)

@@ -24,5 +24,6 @@ cdef int64_t get_datetime64_nanos(object val) except? -1
 
 cpdef datetime localize_pydatetime(datetime dt, object tz)
 cdef int64_t cast_from_unit(object ts, str unit) except? -1
+cpdef (int64_t, int) precision_from_unit(str unit)
 
 cdef int64_t normalize_i8_stamp(int64_t local_val) nogil
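Exposing `precision_from_unit` as `cpdef` lets `tslib.pyx` call it directly. Per its use in the diff above, it returns a pair: the number of nanoseconds per step of the requested unit (for example, 1_000_000_000 for unit "s") and a precision used to round away floating-point artifacts; the exact precision values are defined in conversion.pyx and should be checked there.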

pandas/tests/io/sas/data/datetime.csv (+2 −2)

@@ -1,5 +1,5 @@
 Date1,Date2,DateTime,DateTimeHi,Taiw
-1677-09-22,1677-09-22,1677-09-21 00:12:44,1677-09-21 00:12:43.145226,1912-01-01
+1677-09-22,1677-09-22,1677-09-21 00:12:44,1677-09-21 00:12:43.145225,1912-01-01
 1960-01-01,1960-01-01,1960-01-01 00:00:00,1960-01-01 00:00:00.000000,1960-01-01
 2016-02-29,2016-02-29,2016-02-29 23:59:59,2016-02-29 23:59:59.123456,2016-02-29
-2262-04-11,2262-04-11,2262-04-11 23:47:16,2262-04-11 23:47:16.854774,2262-04-11
+2262-04-11,2262-04-11,2262-04-11 23:47:16,2262-04-11 23:47:16.854775,2262-04-11

pandas/tests/tools/test_to_datetime.py (+5 −3)

@@ -1217,10 +1217,10 @@ def test_unit_mixed(self, cache):
 
     @pytest.mark.parametrize("cache", [True, False])
     def test_unit_rounding(self, cache):
-        # GH 14156: argument will incur floating point errors but no
-        # premature rounding
+        # GH 14156 & GH 20445: argument will incur floating point errors
+        # but no premature rounding
         result = pd.to_datetime(1434743731.8770001, unit="s", cache=cache)
-        expected = pd.Timestamp("2015-06-19 19:55:31.877000093")
+        expected = pd.Timestamp("2015-06-19 19:55:31.877000192")
         assert result == expected
 
     @pytest.mark.parametrize("cache", [True, False])
@@ -1454,6 +1454,8 @@ def test_to_datetime_unit(self):
             ]
             + [NaT]
         )
+        # GH20455 argument will incur floating point errors but no premature rounding
+        result = result.round("ms")
         tm.assert_series_equal(result, expected)
 
         s = pd.concat(
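The updated expected value reflects the new vectorized float path: 1434743731.8770001 is not exactly representable in float64, and scaling it to nanoseconds in a single vectorized multiplication surfaces that representation error slightly differently than the old element-wise path did (hence ...877000192 instead of ...877000093). A quick check of the new behaviour, using the value from the test above (requires a pandas build containing this commit):

    import pandas as pd

    ts = pd.to_datetime(1434743731.8770001, unit="s")
    print(ts)              # Timestamp('2015-06-19 19:55:31.877000192')
    print(ts.round("ms"))  # rounding to milliseconds recovers the intended .877 value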
