-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Fix Timestamp.round errors #22802
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix Timestamp.round errors #22802
Changes from 5 commits
fb25c21
a8ded73
eee47a6
f6f131b
53c8c9b
988e5e3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ cimport ccalendar | |
from conversion import tz_localize_to_utc, normalize_i8_timestamps | ||
from conversion cimport (tz_convert_single, _TSObject, | ||
convert_to_tsobject, convert_datetime_to_tsobject) | ||
import enum | ||
from fields import get_start_end_field, get_date_name_field | ||
from nattype import NaT | ||
from nattype cimport NPY_NAT | ||
|
@@ -57,50 +58,113 @@ cdef inline object create_timestamp_from_ts(int64_t value, | |
return ts_base | ||
|
||
|
||
def round_ns(values, rounder, freq): | ||
@enum.unique | ||
class RoundTo(enum.Enum): | ||
""" | ||
Applies rounding function at given frequency | ||
enumeration defining the available rounding modes | ||
|
||
Attributes | ||
---------- | ||
MINUS_INFTY | ||
round towards -∞, or floor [2]_ | ||
PLUS_INFTY | ||
round towards +∞, or ceil [3]_ | ||
NEAREST_HALF_EVEN | ||
round to nearest, tie-break half to even [6]_ | ||
NEAREST_HALF_MINUS_INFTY | ||
round to nearest, tie-break half to -∞ [5]_ | ||
NEAREST_HALF_PLUS_INFTY | ||
round to nearest, tie-break half to +∞ [4]_ | ||
|
||
|
||
References | ||
---------- | ||
.. [1] "Rounding - Wikipedia" | ||
https://en.wikipedia.org/wiki/Rounding | ||
.. [2] "Rounding down" | ||
https://en.wikipedia.org/wiki/Rounding#Rounding_down | ||
.. [3] "Rounding up" | ||
https://en.wikipedia.org/wiki/Rounding#Rounding_up | ||
.. [4] "Round half up" | ||
https://en.wikipedia.org/wiki/Rounding#Round_half_up | ||
.. [5] "Round half down" | ||
https://en.wikipedia.org/wiki/Rounding#Round_half_down | ||
.. [6] "Round half to even" | ||
https://en.wikipedia.org/wiki/Rounding#Round_half_to_even | ||
""" | ||
MINUS_INFTY = 0 | ||
PLUS_INFTY = 1 | ||
NEAREST_HALF_EVEN = 2 | ||
NEAREST_HALF_PLUS_INFTY = 3 | ||
NEAREST_HALF_MINUS_INFTY = 4 | ||
|
||
|
||
cdef inline _npdivmod(x1, x2): | ||
"""implement divmod for numpy < 1.13""" | ||
return np.floor_divide(x1, x2), np.remainder(x1, x2) | ||
|
||
|
||
try: | ||
from numpy import divmod as npdivmod | ||
except ImportError: | ||
npdivmod = _npdivmod | ||
|
||
|
||
cdef inline _floor_int64(values, unit): | ||
return values - np.remainder(values, unit) | ||
|
||
cdef inline _ceil_int64(values, unit): | ||
return values + np.remainder(-values, unit) | ||
|
||
cdef inline _rounddown_int64(values, unit): | ||
return _ceil_int64(values - unit//2, unit) | ||
|
||
cdef inline _roundup_int64(values, unit): | ||
return _floor_int64(values + unit//2, unit) | ||
|
||
|
||
def round_nsint64(values, mode, freq): | ||
""" | ||
Applies rounding mode at given frequency | ||
|
||
Parameters | ||
---------- | ||
values : :obj:`ndarray` | ||
rounder : function, eg. 'ceil', 'floor', 'round' | ||
mode : instance of `RoundTo` enumeration | ||
freq : str, obj | ||
|
||
Returns | ||
------- | ||
:obj:`ndarray` | ||
""" | ||
|
||
if not isinstance(mode, RoundTo): | ||
raise ValueError('mode should be a RoundTo member') | ||
miccoli marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
unit = to_offset(freq).nanos | ||
|
||
# GH21262 If the Timestamp is multiple of the freq str | ||
# don't apply any rounding | ||
mask = values % unit == 0 | ||
if mask.all(): | ||
return values | ||
r = values.copy() | ||
|
||
if unit < 1000: | ||
# for nano rounding, work with the last 6 digits separately | ||
# due to float precision | ||
buff = 1000000 | ||
r[~mask] = (buff * (values[~mask] // buff) + | ||
unit * (rounder((values[~mask] % buff) * | ||
(1 / float(unit)))).astype('i8')) | ||
else: | ||
if unit % 1000 != 0: | ||
msg = 'Precision will be lost using frequency: {}' | ||
warnings.warn(msg.format(freq)) | ||
# GH19206 | ||
# to deal with round-off when unit is large | ||
if unit >= 1e9: | ||
divisor = 10 ** int(np.log10(unit / 1e7)) | ||
else: | ||
divisor = 10 | ||
r[~mask] = (unit * rounder((values[~mask] * | ||
(divisor / float(unit))) / divisor) | ||
.astype('i8')) | ||
return r | ||
if mode is RoundTo.MINUS_INFTY: | ||
miccoli marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return _floor_int64(values, unit) | ||
elif mode is RoundTo.PLUS_INFTY: | ||
return _ceil_int64(values, unit) | ||
elif mode is RoundTo.NEAREST_HALF_MINUS_INFTY: | ||
miccoli marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return _rounddown_int64(values, unit) | ||
elif mode is RoundTo.NEAREST_HALF_PLUS_INFTY: | ||
return _roundup_int64(values, unit) | ||
elif mode is RoundTo.NEAREST_HALF_EVEN: | ||
# for odd unit there is no need of a tie break | ||
if unit % 2: | ||
return _rounddown_int64(values, unit) | ||
quotient, remainder = npdivmod(values, unit) | ||
mask = np.logical_or( | ||
remainder > (unit // 2), | ||
np.logical_and(remainder == (unit // 2), quotient % 2) | ||
) | ||
quotient[mask] += 1 | ||
return quotient * unit | ||
|
||
# this should be unreachable | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you could say undefined mode |
||
assert False, "should never arrive here" | ||
|
||
|
||
# This is PITA. Because we inherit from datetime, which has very specific | ||
|
@@ -656,7 +720,7 @@ class Timestamp(_Timestamp): | |
|
||
return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, freq) | ||
|
||
def _round(self, freq, rounder, ambiguous='raise'): | ||
def _round(self, freq, mode, ambiguous='raise'): | ||
if self.tz is not None: | ||
value = self.tz_localize(None).value | ||
else: | ||
|
@@ -665,7 +729,7 @@ class Timestamp(_Timestamp): | |
value = np.array([value], dtype=np.int64) | ||
|
||
# Will only ever contain 1 element for timestamp | ||
r = round_ns(value, rounder, freq)[0] | ||
r = round_nsint64(value, mode, freq)[0] | ||
result = Timestamp(r, unit='ns') | ||
if self.tz is not None: | ||
result = result.tz_localize(self.tz, ambiguous=ambiguous) | ||
|
@@ -694,7 +758,7 @@ class Timestamp(_Timestamp): | |
------ | ||
ValueError if the freq cannot be converted | ||
""" | ||
return self._round(freq, np.round, ambiguous) | ||
return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous) | ||
|
||
def floor(self, freq, ambiguous='raise'): | ||
""" | ||
|
@@ -715,7 +779,7 @@ class Timestamp(_Timestamp): | |
------ | ||
ValueError if the freq cannot be converted | ||
""" | ||
return self._round(freq, np.floor, ambiguous) | ||
return self._round(freq, RoundTo.MINUS_INFTY, ambiguous) | ||
|
||
def ceil(self, freq, ambiguous='raise'): | ||
""" | ||
|
@@ -736,7 +800,7 @@ class Timestamp(_Timestamp): | |
------ | ||
ValueError if the freq cannot be converted | ||
""" | ||
return self._round(freq, np.ceil, ambiguous) | ||
return self._round(freq, RoundTo.PLUS_INFTY, ambiguous) | ||
|
||
@property | ||
def tz(self): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
from pandas._libs.tslibs import conversion | ||
from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG | ||
from pandas import Timestamp, NaT | ||
from pandas.tseries.frequencies import to_offset | ||
|
||
|
||
class TestTimestampUnaryOps(object): | ||
|
@@ -70,7 +71,7 @@ def test_round_subsecond(self): | |
assert result == expected | ||
|
||
def test_round_nonstandard_freq(self): | ||
with tm.assert_produces_warning(): | ||
with tm.assert_produces_warning(False): | ||
Timestamp('2016-10-17 12:00:00.001501031').round('1010ns') | ||
|
||
def test_round_invalid_arg(self): | ||
|
@@ -154,6 +155,46 @@ def test_round_dst_border(self, method): | |
with pytest.raises(pytz.AmbiguousTimeError): | ||
getattr(ts, method)('H', ambiguous='raise') | ||
|
||
@pytest.mark.parametrize('timestamp', [ | ||
'2018-01-01 0:0:0.124999360', | ||
'2018-01-01 0:0:0.125000367', | ||
'2018-01-01 0:0:0.125500', | ||
'2018-01-01 0:0:0.126500', | ||
'2018-01-01 12:00:00', | ||
'2019-01-01 12:00:00', | ||
]) | ||
@pytest.mark.parametrize('freq', [ | ||
'2ns', '3ns', '4ns', '5ns', '6ns', '7ns', | ||
'250ns', '500ns', '750ns', | ||
'1us', '19us', '250us', '500us', '750us', | ||
'1s', '2s', '3s', | ||
'1D', | ||
]) | ||
def test_round_int64(self, timestamp, freq): | ||
"""check that all rounding modes are accurate to int64 precision | ||
see GH#22591 | ||
""" | ||
dt = Timestamp(timestamp) | ||
unit = to_offset(freq).nanos | ||
|
||
# test floor | ||
result = dt.floor(freq) | ||
assert result.value % unit == 0, "floor not a {} multiple".format(freq) | ||
assert 0 <= dt.value - result.value < unit, "floor error" | ||
|
||
# test ceil | ||
result = dt.ceil(freq) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are there round errors in timedelta ops as well? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, also BTW I found #22591 processing real scientific data, in which I had timestamps for 2018 measurements with about 1 µs resolution. On the contrary I cannot imagine of an application in which you have 50 years time deltas with such a small resolution. Of course you could create a round-trip example:
which gives different results with respect to
Nevertheless I would suggest to postpone the resolution of the timedelta issue after this PR is merged. (Please note also that the current complex machinery, which tries to improve rounding errors, is implemented only for Timedate and not for Timedelta.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no i get it, that's why I asked. Ok to fix later, esp if its not very common. |
||
assert result.value % unit == 0, "ceil not a {} multiple".format(freq) | ||
assert 0 <= result.value - dt.value < unit, "ceil error" | ||
|
||
# test round | ||
result = dt.round(freq) | ||
assert result.value % unit == 0, "round not a {} multiple".format(freq) | ||
assert abs(result.value - dt.value) <= unit // 2, "round error" | ||
if unit % 2 == 0 and abs(result.value - dt.value) == unit // 2: | ||
# round half to even | ||
assert result.value // unit % 2 == 0, "round half to even error" | ||
|
||
# -------------------------------------------------------------- | ||
# Timestamp.replace | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
raise -> rise
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thanks for catching this typo