From 4e36a9b098369b0b224a54bbe8bfc3ac8cadd1a0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Mar 2019 10:33:07 +0100 Subject: [PATCH 1/6] REGR: to_timedelta precision issues with floating data --- doc/source/whatsnew/v0.24.2.rst | 1 + pandas/core/arrays/timedeltas.py | 10 +++------- pandas/tests/indexes/timedeltas/test_tools.py | 7 +++++++ 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 2c6d1e01ed89b..20a0d1fbeb258 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -31,6 +31,7 @@ Fixed Regressions - Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`) - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`) - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) +- Fixed regression in :func:`to_timedelta` loosing precision when converting floating data to ``Timedelta`` data (:issue:`25077`). - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) .. _whatsnew_0242.enhancements: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 74fe8072e6924..a39ce509f18da 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -918,13 +918,9 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): copy = copy and not copy_made elif is_float_dtype(data.dtype): - # treat as multiples of the given unit. If after converting to nanos, - # there are fractional components left, these are truncated - # (i.e. NOT rounded) - mask = np.isnan(data) - coeff = np.timedelta64(1, unit) / np.timedelta64(1, 'ns') - data = (coeff * data).astype(np.int64).view('timedelta64[ns]') - data[mask] = iNaT + # object_to_td64ns has custom logic for float -> int conversion + # to avoid precision issues + data = objects_to_td64ns(data, unit=unit, errors=errors) copy = False elif is_timedelta64_dtype(data.dtype): diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index 58482a174dfd1..819184d4b14f3 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -181,3 +181,10 @@ def test_to_timedelta_on_missing_values(self): actual = pd.to_timedelta(pd.NaT) assert actual.value == timedelta_NaT.astype('int64') + + def test_to_timedelta_float(self): + # https://github.com/pandas-dev/pandas/issues/25077 + arr = np.arange(0, 1, 1e-6)[-10:] + result = pd.to_timedelta(arr, unit='s') + expected_asi8 = np.arange(999990000, int(1e9), 1000, dtype='int64') + tm.assert_numpy_array_equal(result.asi8, expected_asi8) From 39b15aad475b016576a655dedbb445c3690ea23c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Mar 2019 13:50:22 +0100 Subject: [PATCH 2/6] POC using proper logic --- pandas/_libs/tslibs/timedeltas.pyx | 41 ++++++++++++++++++++++++++++++ pandas/core/arrays/timedeltas.py | 15 ++++++++--- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 37aa05659b70f..3a2a18a86885f 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -246,6 +246,47 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): return iresult.base # .base to access underlying np.ndarray +def precision_from_unit(object unit): + cdef: + int64_t m + int p + + if unit == 'Y': + m = 1000000000L * 31556952 + p = 9 + elif unit == 'M': + m = 1000000000L * 2629746 + p = 9 + elif unit == 'W': + m = 1000000000L * DAY_SECONDS * 7 + p = 9 + elif unit == 'D' or unit == 'd': + m = 1000000000L * DAY_SECONDS + p = 9 + elif unit == 'h': + m = 1000000000L * 3600 + p = 9 + elif unit == 'm': + m = 1000000000L * 60 + p = 9 + elif unit == 's': + m = 1000000000L + p = 9 + elif unit == 'ms': + m = 1000000L + p = 6 + elif unit == 'us': + m = 1000L + p = 3 + elif unit == 'ns' or unit is None: + m = 1L + p = 0 + else: + raise ValueError("cannot cast unit {unit}".format(unit=unit)) + + return m, p + + cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: """ return a casting of the unit represented to nanoseconds round the fractional part of a float to our precision, p """ diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a39ce509f18da..1badb476085bf 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -11,7 +11,7 @@ from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import ( - array_to_timedelta64, parse_timedelta_unit) + array_to_timedelta64, parse_timedelta_unit, precision_from_unit) import pandas.compat as compat from pandas.util._decorators import Appender @@ -918,9 +918,16 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): copy = copy and not copy_made elif is_float_dtype(data.dtype): - # object_to_td64ns has custom logic for float -> int conversion - # to avoid precision issues - data = objects_to_td64ns(data, unit=unit, errors=errors) + # cast the unit, multiply base/frace separately + # to avoid precision issues from float -> int + mask = np.isnan(data) + m, p = precision_from_unit(unit) + base = data.astype(np.int64) + frac = data - base + if p: + frac = np.round(frac, p) + data = (base * m + (frac * m).astype(np.int64)).view('timedelta64[ns]') + data[mask] = iNaT copy = False elif is_timedelta64_dtype(data.dtype): From 338a652f72d2313689b0803fc3ad186d1fc83434 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Mar 2019 14:32:19 +0100 Subject: [PATCH 3/6] one possible way to share implementation --- pandas/_libs/tslibs/timedeltas.pyx | 90 +++++++++++------------------- 1 file changed, 34 insertions(+), 56 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 3a2a18a86885f..529737b86c88b 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -246,42 +246,49 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): return iresult.base # .base to access underlying np.ndarray -def precision_from_unit(object unit): - cdef: - int64_t m - int p - +cdef int _precision_from_unit(object unit, int64_t* m, int* p): if unit == 'Y': - m = 1000000000L * 31556952 - p = 9 + m[0] = 1000000000L * 31556952 + p[0] = 9 elif unit == 'M': - m = 1000000000L * 2629746 - p = 9 + m[0] = 1000000000L * 2629746 + p[0] = 9 elif unit == 'W': - m = 1000000000L * DAY_SECONDS * 7 - p = 9 + m[0] = 1000000000L * DAY_SECONDS * 7 + p[0] = 9 elif unit == 'D' or unit == 'd': - m = 1000000000L * DAY_SECONDS - p = 9 + m[0] = 1000000000L * DAY_SECONDS + p[0] = 9 elif unit == 'h': - m = 1000000000L * 3600 - p = 9 + m[0] = 1000000000L * 3600 + p[0] = 9 elif unit == 'm': - m = 1000000000L * 60 - p = 9 + m[0] = 1000000000L * 60 + p[0] = 9 elif unit == 's': - m = 1000000000L - p = 9 + m[0] = 1000000000L + p[0] = 9 elif unit == 'ms': - m = 1000000L - p = 6 + m[0] = 1000000L + p[0] = 6 elif unit == 'us': - m = 1000L - p = 3 + m[0] = 1000L + p[0] = 3 elif unit == 'ns' or unit is None: - m = 1L - p = 0 + m[0] = 1L + p[0] = 0 else: + return -1 + return 0 + + +def precision_from_unit(object unit): + cdef: + int64_t m + int p + + status = _precision_from_unit(unit, &m, &p) + if status == -1: raise ValueError("cannot cast unit {unit}".format(unit=unit)) return m, p @@ -294,37 +301,8 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: int64_t m int p - if unit == 'Y': - m = 1000000000L * 31556952 - p = 9 - elif unit == 'M': - m = 1000000000L * 2629746 - p = 9 - elif unit == 'W': - m = 1000000000L * DAY_SECONDS * 7 - p = 9 - elif unit == 'D' or unit == 'd': - m = 1000000000L * DAY_SECONDS - p = 9 - elif unit == 'h': - m = 1000000000L * 3600 - p = 9 - elif unit == 'm': - m = 1000000000L * 60 - p = 9 - elif unit == 's': - m = 1000000000L - p = 9 - elif unit == 'ms': - m = 1000000L - p = 6 - elif unit == 'us': - m = 1000L - p = 3 - elif unit == 'ns' or unit is None: - m = 1L - p = 0 - else: + status = _precision_from_unit(unit, &m, &p) + if status == -1: raise ValueError("cannot cast unit {unit}".format(unit=unit)) # just give me the unit back From 5cc3c39bcab338354fcc3f605410cab47a9d20d9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Mar 2019 14:53:38 +0100 Subject: [PATCH 4/6] clean-up error handling --- pandas/_libs/tslibs/timedeltas.pyx | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 529737b86c88b..af4ab8d0883d5 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -246,7 +246,8 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): return iresult.base # .base to access underlying np.ndarray -cdef int _precision_from_unit(object unit, int64_t* m, int* p): +cdef inline int _precision_from_unit(object unit, + int64_t* m, int* p) except? -1: if unit == 'Y': m[0] = 1000000000L * 31556952 p[0] = 9 @@ -278,7 +279,7 @@ cdef int _precision_from_unit(object unit, int64_t* m, int* p): m[0] = 1L p[0] = 0 else: - return -1 + raise ValueError("cannot cast unit {unit}".format(unit=unit)) return 0 @@ -287,9 +288,7 @@ def precision_from_unit(object unit): int64_t m int p - status = _precision_from_unit(unit, &m, &p) - if status == -1: - raise ValueError("cannot cast unit {unit}".format(unit=unit)) + _precision_from_unit(unit, &m, &p) return m, p @@ -301,9 +300,7 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: int64_t m int p - status = _precision_from_unit(unit, &m, &p) - if status == -1: - raise ValueError("cannot cast unit {unit}".format(unit=unit)) + _precision_from_unit(unit, &m, &p) # just give me the unit back if ts is None: From 943888bf67b01d4732189f97977d55cb94779bd7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Mar 2019 23:48:03 +0100 Subject: [PATCH 5/6] make it a cpdef --- pandas/_libs/tslibs/timedeltas.pyx | 63 ++++++++++++++---------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index af4ab8d0883d5..5918c7963acf7 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -246,50 +246,47 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): return iresult.base # .base to access underlying np.ndarray -cdef inline int _precision_from_unit(object unit, - int64_t* m, int* p) except? -1: +cpdef inline object precision_from_unit(object unit): + """ + Return a casting of the unit represented to nanoseconds + the precision + to round the fractional part. + """ + cdef: + int64_t m + int p + if unit == 'Y': - m[0] = 1000000000L * 31556952 - p[0] = 9 + m = 1000000000L * 31556952 + p = 9 elif unit == 'M': - m[0] = 1000000000L * 2629746 - p[0] = 9 + m = 1000000000L * 2629746 + p = 9 elif unit == 'W': - m[0] = 1000000000L * DAY_SECONDS * 7 - p[0] = 9 + m = 1000000000L * DAY_SECONDS * 7 + p = 9 elif unit == 'D' or unit == 'd': - m[0] = 1000000000L * DAY_SECONDS - p[0] = 9 + m = 1000000000L * DAY_SECONDS + p = 9 elif unit == 'h': - m[0] = 1000000000L * 3600 - p[0] = 9 + m = 1000000000L * 3600 + p = 9 elif unit == 'm': - m[0] = 1000000000L * 60 - p[0] = 9 + m = 1000000000L * 60 + p = 9 elif unit == 's': - m[0] = 1000000000L - p[0] = 9 + m = 1000000000L + p = 9 elif unit == 'ms': - m[0] = 1000000L - p[0] = 6 + m = 1000000L + p = 6 elif unit == 'us': - m[0] = 1000L - p[0] = 3 + m = 1000L + p = 3 elif unit == 'ns' or unit is None: - m[0] = 1L - p[0] = 0 + m = 1L + p = 0 else: raise ValueError("cannot cast unit {unit}".format(unit=unit)) - return 0 - - -def precision_from_unit(object unit): - cdef: - int64_t m - int p - - _precision_from_unit(unit, &m, &p) - return m, p @@ -300,7 +297,7 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: int64_t m int p - _precision_from_unit(unit, &m, &p) + m, p = precision_from_unit(unit) # just give me the unit back if ts is None: From 74c3e32a4dd69518201ff864441a33d2af355378 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Mar 2019 23:52:19 +0100 Subject: [PATCH 6/6] typo whatsnew --- doc/source/whatsnew/v0.24.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 20a0d1fbeb258..13b90f18c8083 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -31,7 +31,7 @@ Fixed Regressions - Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`) - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`) - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) -- Fixed regression in :func:`to_timedelta` loosing precision when converting floating data to ``Timedelta`` data (:issue:`25077`). +- Fixed regression in :func:`to_timedelta` losing precision when converting floating data to ``Timedelta`` data (:issue:`25077`). - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) .. _whatsnew_0242.enhancements: