Skip to content

Commit f021bbc

Browse files
miccoli authored and jreback committed
Fix Timestamp.round errors (pandas-dev#22802)
1 parent b92b043 commit f021bbc

File tree

5 files changed

+192
-44
lines changed

5 files changed

+192
-44
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -654,6 +654,7 @@ Datetimelike
654654
- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`)
655655
- Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`)
656656
- Bug in :class:`DatetimeIndex` where frequency was being set if original frequency was ``None`` (:issue:`22150`)
657+
- Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) that could give rise to loss of precision (:issue:`22591`)
657658

658659
Timedelta
659660
^^^^^^^^^

pandas/_libs/tslibs/timestamps.pyx

+101-36
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ cimport ccalendar
2222
from conversion import tz_localize_to_utc, normalize_i8_timestamps
2323
from conversion cimport (tz_convert_single, _TSObject,
2424
convert_to_tsobject, convert_datetime_to_tsobject)
25+
import enum
2526
from fields import get_start_end_field, get_date_name_field
2627
from nattype import NaT
2728
from nattype cimport NPY_NAT
@@ -57,50 +58,114 @@ cdef inline object create_timestamp_from_ts(int64_t value,
5758
return ts_base
5859

5960

60-
def round_ns(values, rounder, freq):
61+
@enum.unique
class RoundTo(enum.Enum):
    """
    Enumeration of the rounding modes accepted by ``round_nsint64``.

    See [1]_ for a general overview of rounding rules.

    Attributes
    ----------
    MINUS_INFTY
        always round towards -∞, i.e. floor [2]_
    PLUS_INFTY
        always round towards +∞, i.e. ceil [3]_
    NEAREST_HALF_EVEN
        round to the nearest multiple; an exact tie goes to the even
        multiple [6]_
    NEAREST_HALF_MINUS_INFTY
        round to the nearest multiple; an exact tie goes towards -∞ [5]_
    NEAREST_HALF_PLUS_INFTY
        round to the nearest multiple; an exact tie goes towards +∞ [4]_

    References
    ----------
    .. [1] "Rounding - Wikipedia"
           https://en.wikipedia.org/wiki/Rounding
    .. [2] "Rounding down"
           https://en.wikipedia.org/wiki/Rounding#Rounding_down
    .. [3] "Rounding up"
           https://en.wikipedia.org/wiki/Rounding#Rounding_up
    .. [4] "Round half up"
           https://en.wikipedia.org/wiki/Rounding#Round_half_up
    .. [5] "Round half down"
           https://en.wikipedia.org/wiki/Rounding#Round_half_down
    .. [6] "Round half to even"
           https://en.wikipedia.org/wiki/Rounding#Round_half_to_even
    """
    # the numeric values are arbitrary but must stay unique (enum.unique)
    MINUS_INFTY = 0
    PLUS_INFTY = 1
    NEAREST_HALF_EVEN = 2
    NEAREST_HALF_PLUS_INFTY = 3
    NEAREST_HALF_MINUS_INFTY = 4
100+
101+
102+
cdef inline _npdivmod(x1, x2):
    """Fallback implementation of divmod for numpy < 1.13.

    Returns the pair ``(floor_divide(x1, x2), remainder(x1, x2))``.
    """
    quotient = np.floor_divide(x1, x2)
    remainder = np.remainder(x1, x2)
    return quotient, remainder
105+
106+
107+
try:
108+
from numpy import divmod as npdivmod
109+
except ImportError:
110+
npdivmod = _npdivmod
111+
112+
113+
cdef inline _floor_int64(values, unit):
    """Round ``values`` down (towards -∞) to a multiple of ``unit``."""
    # np.remainder takes the sign of the divisor, so for a positive unit
    # the excess lies in [0, unit) even for negative values
    excess = np.remainder(values, unit)
    return values - excess
115+
116+
cdef inline _ceil_int64(values, unit):
    """Round ``values`` up (towards +∞) to a multiple of ``unit``."""
    # remainder of the negated input is the distance up to the next multiple
    shortfall = np.remainder(-values, unit)
    return values + shortfall
118+
119+
cdef inline _rounddown_int64(values, unit):
    """Round to the nearest multiple of ``unit``; an exact tie goes down.

    Despite the name this is "round half down", not a plain floor.
    """
    # shift down by half a unit, then take the ceiling
    half = unit // 2
    shifted = values - half
    return _ceil_int64(shifted, unit)
121+
122+
cdef inline _roundup_int64(values, unit):
    """Round to the nearest multiple of ``unit``; an exact tie goes up.

    Despite the name this is "round half up", not a plain ceiling.
    """
    # shift up by half a unit, then take the floor
    half = unit // 2
    shifted = values + half
    return _floor_int64(shifted, unit)
124+
125+
126+
def round_nsint64(values, mode, freq):
    """
    Applies rounding mode at given frequency

    The frequency is converted to an integer number of nanoseconds and the
    rounding is done with integer arithmetic only.

    Parameters
    ----------
    values : :obj:`ndarray`
    mode : instance of `RoundTo` enumeration
    freq : str, obj

    Returns
    -------
    :obj:`ndarray`

    Raises
    ------
    ValueError
        if `mode` is not a `RoundTo` member
    """

    if not isinstance(mode, RoundTo):
        raise ValueError('mode should be a RoundTo member')

    unit = to_offset(freq).nanos

    if mode is RoundTo.MINUS_INFTY:
        return _floor_int64(values, unit)
    elif mode is RoundTo.PLUS_INFTY:
        return _ceil_int64(values, unit)
    elif mode is RoundTo.NEAREST_HALF_MINUS_INFTY:
        return _rounddown_int64(values, unit)
    elif mode is RoundTo.NEAREST_HALF_PLUS_INFTY:
        return _roundup_int64(values, unit)
    elif mode is RoundTo.NEAREST_HALF_EVEN:
        # for odd unit there is no need of a tie break
        if unit % 2:
            return _rounddown_int64(values, unit)
        quotient, remainder = npdivmod(values, unit)
        # bump the quotient when past the half-way point, or exactly on it
        # with an odd quotient (so that the result lands on an even multiple)
        mask = np.logical_or(
            remainder > (unit // 2),
            np.logical_and(remainder == (unit // 2), quotient % 2)
        )
        quotient[mask] += 1
        return quotient * unit

    # if/elif above should catch all rounding modes defined in enum 'RoundTo':
    # if flow of control arrives here, it is a bug.  Raise explicitly rather
    # than ``assert False`` so the guard survives builds with assertions
    # disabled instead of silently returning None.
    raise AssertionError(
        "round_nsint64 called with an unrecognized rounding mode")
104169

105170

106171
# This is PITA. Because we inherit from datetime, which has very specific
@@ -656,7 +721,7 @@ class Timestamp(_Timestamp):
656721

657722
return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, freq)
658723

659-
def _round(self, freq, rounder, ambiguous='raise'):
724+
def _round(self, freq, mode, ambiguous='raise'):
660725
if self.tz is not None:
661726
value = self.tz_localize(None).value
662727
else:
@@ -665,7 +730,7 @@ class Timestamp(_Timestamp):
665730
value = np.array([value], dtype=np.int64)
666731

667732
# Will only ever contain 1 element for timestamp
668-
r = round_ns(value, rounder, freq)[0]
733+
r = round_nsint64(value, mode, freq)[0]
669734
result = Timestamp(r, unit='ns')
670735
if self.tz is not None:
671736
result = result.tz_localize(self.tz, ambiguous=ambiguous)
@@ -694,7 +759,7 @@ class Timestamp(_Timestamp):
694759
------
695760
ValueError if the freq cannot be converted
696761
"""
697-
return self._round(freq, np.round, ambiguous)
762+
return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous)
698763

699764
def floor(self, freq, ambiguous='raise'):
700765
"""
@@ -715,7 +780,7 @@ class Timestamp(_Timestamp):
715780
------
716781
ValueError if the freq cannot be converted
717782
"""
718-
return self._round(freq, np.floor, ambiguous)
783+
return self._round(freq, RoundTo.MINUS_INFTY, ambiguous)
719784

720785
def ceil(self, freq, ambiguous='raise'):
721786
"""
@@ -736,7 +801,7 @@ class Timestamp(_Timestamp):
736801
------
737802
ValueError if the freq cannot be converted
738803
"""
739-
return self._round(freq, np.ceil, ambiguous)
804+
return self._round(freq, RoundTo.PLUS_INFTY, ambiguous)
740805

741806
@property
742807
def tz(self):

pandas/core/indexes/datetimelike.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import numpy as np
1212

1313
from pandas._libs import lib, iNaT, NaT
14-
from pandas._libs.tslibs.timestamps import round_ns
14+
from pandas._libs.tslibs.timestamps import round_nsint64, RoundTo
1515

1616
from pandas.core.dtypes.common import (
1717
ensure_int64,
@@ -180,10 +180,10 @@ class TimelikeOps(object):
180180
"""
181181
)
182182

183-
def _round(self, freq, rounder, ambiguous):
183+
def _round(self, freq, mode, ambiguous):
184184
# round the local times
185185
values = _ensure_datetimelike_to_i8(self)
186-
result = round_ns(values, rounder, freq)
186+
result = round_nsint64(values, mode, freq)
187187
result = self._maybe_mask_results(result, fill_value=NaT)
188188

189189
attribs = self._get_attributes_dict()
@@ -197,15 +197,15 @@ def _round(self, freq, rounder, ambiguous):
197197

198198
@Appender((_round_doc + _round_example).format(op="round"))
def round(self, freq, ambiguous='raise'):
    # no docstring here on purpose: the text comes from the Appender decorator
    mode = RoundTo.NEAREST_HALF_EVEN
    return self._round(freq, mode, ambiguous)
201201

202202
@Appender((_round_doc + _floor_example).format(op="floor"))
def floor(self, freq, ambiguous='raise'):
    # no docstring here on purpose: the text comes from the Appender decorator
    mode = RoundTo.MINUS_INFTY
    return self._round(freq, mode, ambiguous)
205205

206206
@Appender((_round_doc + _ceil_example).format(op="ceil"))
def ceil(self, freq, ambiguous='raise'):
    # no docstring here on purpose: the text comes from the Appender decorator
    mode = RoundTo.PLUS_INFTY
    return self._round(freq, mode, ambiguous)
209209

210210

211211
class DatetimeIndexOpsMixin(DatetimeLikeArrayMixin):

pandas/tests/indexes/datetimes/test_scalar_compat.py

+42-1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import pandas as pd
1212

1313
from pandas import date_range, Timestamp, DatetimeIndex
14+
from pandas.tseries.frequencies import to_offset
1415

1516

1617
class TestDatetimeIndexOps(object):
@@ -124,7 +125,7 @@ def test_round(self, tz_naive_fixture):
124125
expected = DatetimeIndex(['2016-10-17 12:00:00.001501030'])
125126
tm.assert_index_equal(result, expected)
126127

127-
with tm.assert_produces_warning():
128+
with tm.assert_produces_warning(False):
128129
ts = '2016-10-17 12:00:00.001501031'
129130
DatetimeIndex([ts]).round('1010ns')
130131

@@ -169,6 +170,46 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected):
169170
expected = DatetimeIndex(list(expected))
170171
assert expected.equals(result)
171172

173+
@pytest.mark.parametrize('start, index_freq, periods', [
    ('2018-01-01', '12H', 25),
    ('2018-01-01 0:0:0.124999', '1ns', 1000),
])
@pytest.mark.parametrize('round_freq', [
    '2ns', '3ns', '4ns', '5ns', '6ns', '7ns',
    '250ns', '500ns', '750ns',
    '1us', '19us', '250us', '500us', '750us',
    '1s', '2s', '3s',
    '12H', '1D',
])
def test_round_int64(self, start, index_freq, periods, round_freq):
    """check that floor, ceil and round are exact in int64 arithmetic"""
    # build the range with date_range rather than the DatetimeIndex
    # constructor, whose start/periods form is deprecated
    dt = date_range(start=start, freq=index_freq, periods=periods)
    unit = to_offset(round_freq).nanos

    # test floor
    result = dt.floor(round_freq)
    diff = dt.asi8 - result.asi8
    mod = result.asi8 % unit
    assert (mod == 0).all(), "floor not a {} multiple".format(round_freq)
    assert (0 <= diff).all() and (diff < unit).all(), "floor error"

    # test ceil
    result = dt.ceil(round_freq)
    diff = result.asi8 - dt.asi8
    mod = result.asi8 % unit
    assert (mod == 0).all(), "ceil not a {} multiple".format(round_freq)
    assert (0 <= diff).all() and (diff < unit).all(), "ceil error"

    # test round
    result = dt.round(round_freq)
    diff = abs(result.asi8 - dt.asi8)
    mod = result.asi8 % unit
    assert (mod == 0).all(), "round not a {} multiple".format(round_freq)
    assert (diff <= unit // 2).all(), "round error"
    if unit % 2 == 0:
        # half-to-even is a statement about the quotient result // unit;
        # the result itself is always even here (a multiple of an even
        # unit), so checking result % 2 would be vacuous
        ties = diff == unit // 2
        assert (
            (result.asi8[ties] // unit) % 2 == 0
        ).all(), "round half to even error"
212+
172213
# ----------------------------------------------------------------
173214
# DatetimeIndex.normalize
174215

pandas/tests/scalar/timestamp/test_unary_ops.py

+42-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from pandas._libs.tslibs import conversion
1414
from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG
1515
from pandas import Timestamp, NaT
16+
from pandas.tseries.frequencies import to_offset
1617

1718

1819
class TestTimestampUnaryOps(object):
@@ -70,7 +71,7 @@ def test_round_subsecond(self):
7071
assert result == expected
7172

7273
def test_round_nonstandard_freq(self):
    # rounding to a frequency that is not a multiple of 1000ns
    # is expected to complete without emitting any warning
    no_warning = tm.assert_produces_warning(False)
    with no_warning:
        Timestamp('2016-10-17 12:00:00.001501031').round('1010ns')
7576

7677
def test_round_invalid_arg(self):
@@ -154,6 +155,46 @@ def test_round_dst_border(self, method):
154155
with pytest.raises(pytz.AmbiguousTimeError):
155156
getattr(ts, method)('H', ambiguous='raise')
156157

158+
@pytest.mark.parametrize('timestamp', [
    '2018-01-01 0:0:0.124999360',
    '2018-01-01 0:0:0.125000367',
    '2018-01-01 0:0:0.125500',
    '2018-01-01 0:0:0.126500',
    '2018-01-01 12:00:00',
    '2019-01-01 12:00:00',
])
@pytest.mark.parametrize('freq', [
    '2ns', '3ns', '4ns', '5ns', '6ns', '7ns',
    '250ns', '500ns', '750ns',
    '1us', '19us', '250us', '500us', '750us',
    '1s', '2s', '3s',
    '1D',
])
def test_round_int64(self, timestamp, freq):
    """check that all rounding modes are accurate to int64 precision
       see GH#22591
    """
    stamp = Timestamp(timestamp)
    unit = to_offset(freq).nanos

    # floor: a multiple of unit, at most one unit below the input
    floored = stamp.floor(freq)
    assert floored.value % unit == 0, "floor not a {} multiple".format(freq)
    assert 0 <= stamp.value - floored.value < unit, "floor error"

    # ceil: a multiple of unit, at most one unit above the input
    ceiled = stamp.ceil(freq)
    assert ceiled.value % unit == 0, "ceil not a {} multiple".format(freq)
    assert 0 <= ceiled.value - stamp.value < unit, "ceil error"

    # round: a multiple of unit, at most half a unit away from the input
    rounded = stamp.round(freq)
    assert rounded.value % unit == 0, "round not a {} multiple".format(freq)
    distance = abs(rounded.value - stamp.value)
    assert distance <= unit // 2, "round error"
    if unit % 2 == 0 and distance == unit // 2:
        # round half to even
        assert rounded.value // unit % 2 == 0, "round half to even error"
197+
157198
# --------------------------------------------------------------
158199
# Timestamp.replace
159200

0 commit comments

Comments
 (0)