Skip to content

BUG: Fix a bug in 'timedelta_range' that produced an extra point on a edge case (fix #30353) #33498

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
May 9, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,9 @@ Timedelta
- Timedeltas now understand ``µs`` as identifier for microsecond (:issue:`32899`)
- :class:`Timedelta` string representation now includes nanoseconds, when nanoseconds are non-zero (:issue:`9309`)
- Bug in comparing a :class:`Timedelta`` object against a ``np.ndarray`` with ``timedelta64`` dtype incorrectly viewing all entries as unequal (:issue:`33441`)
- Bug in :func:`timedelta_range` that produced an extra point on a edge case (:issue:`30353`, :issue:`33498`)
- Bug in :meth:`DataFrame.resample` that produced an extra point on a edge case (:issue:`30353`, :issue:`13022`, :issue:`33498`)
- Bug in :meth:`DataFrame.resample` that ignored the ``loffset`` argument when dealing with timedelta (:issue:`7687`, :issue:`33498`)
Comment on lines +577 to +579
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hope this is okay @WillAyd 🙂. I'll be happy to change this if you disagree with this changelog.

BTW, regarding loffset, as mentioned earlier (https://github.com/pandas-dev/pandas/pull/33498/files#r407231964):

Useful because otherwise tests/resample/test_base.py::test_resample_loffset_arg_type was always failing.
However, this piece of code is questionable since #31809 will deprecate loffset anyway.


Timezones
^^^^^^^^^
Expand Down
107 changes: 47 additions & 60 deletions pandas/core/arrays/_ranges.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,84 +3,71 @@
(and possibly TimedeltaArray/PeriodArray)
"""

from typing import Tuple
from typing import Union

import numpy as np

from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp
from pandas._libs.tslibs import OutOfBoundsDatetime, Timedelta, Timestamp

from pandas.tseries.offsets import DateOffset, Tick, generate_range
from pandas.tseries.offsets import DateOffset


def generate_regular_range(
start: Timestamp, end: Timestamp, periods: int, freq: DateOffset
) -> Tuple[np.ndarray, str]:
start: Union[Timestamp, Timedelta],
end: Union[Timestamp, Timedelta],
periods: int,
freq: DateOffset,
):
"""
Generate a range of dates with the spans between dates described by
the given `freq` DateOffset.
Generate a range of dates or timestamps with the spans between dates
described by the given `freq` DateOffset.

Parameters
----------
start : Timestamp or None
first point of produced date range
end : Timestamp or None
last point of produced date range
start : Timedelta, Timestamp or None
First point of produced date range.
end : Timedelta, Timestamp or None
Last point of produced date range.
periods : int
number of periods in produced date range
freq : DateOffset
describes space between dates in produced date range
Number of periods in produced date range.
freq : Tick
Describes space between dates in produced date range.

Returns
-------
ndarray[np.int64] representing nanosecond unix timestamps
ndarray[np.int64] Representing nanoseconds.
"""
if isinstance(freq, Tick):
stride = freq.nanos
if periods is None:
b = Timestamp(start).value
# cannot just use e = Timestamp(end) + 1 because arange breaks when
# stride is too large, see GH10887
e = b + (Timestamp(end).value - b) // stride * stride + stride // 2 + 1
# end.tz == start.tz by this point due to _generate implementation
tz = start.tz
elif start is not None:
b = Timestamp(start).value
e = _generate_range_overflow_safe(b, periods, stride, side="start")
tz = start.tz
elif end is not None:
e = Timestamp(end).value + stride
b = _generate_range_overflow_safe(e, periods, stride, side="end")
tz = end.tz
else:
raise ValueError(
"at least 'start' or 'end' should be specified "
"if a 'period' is given."
)

with np.errstate(over="raise"):
# If the range is sufficiently large, np.arange may overflow
# and incorrectly return an empty array if not caught.
try:
values = np.arange(b, e, stride, dtype=np.int64)
except FloatingPointError:
xdr = [b]
while xdr[-1] != e:
xdr.append(xdr[-1] + stride)
values = np.array(xdr[:-1], dtype=np.int64)

start = start.value if start is not None else None
end = end.value if end is not None else None
stride = freq.nanos

if periods is None:
b = start
# cannot just use e = Timestamp(end) + 1 because arange breaks when
# stride is too large, see GH10887
e = b + (end - b) // stride * stride + stride // 2 + 1
elif start is not None:
b = start
e = _generate_range_overflow_safe(b, periods, stride, side="start")
elif end is not None:
e = end + stride
b = _generate_range_overflow_safe(e, periods, stride, side="end")
else:
tz = None
# start and end should have the same timezone by this point
if start is not None:
tz = start.tz
elif end is not None:
tz = end.tz

xdr = generate_range(start=start, end=end, periods=periods, offset=freq)

values = np.array([x.value for x in xdr], dtype=np.int64)
raise ValueError(
"at least 'start' or 'end' should be specified if a 'period' is given."
)

return values, tz
with np.errstate(over="raise"):
# If the range is sufficiently large, np.arange may overflow
# and incorrectly return an empty array if not caught.
try:
values = np.arange(b, e, stride, dtype=np.int64)
except FloatingPointError:
xdr = [b]
while xdr[-1] != e:
xdr.append(xdr[-1] + stride)
values = np.array(xdr[:-1], dtype=np.int64)
return values


def _generate_range_overflow_safe(
Expand Down
29 changes: 12 additions & 17 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
import pandas.core.common as com

from pandas.tseries.frequencies import get_period_alias, to_offset
from pandas.tseries.offsets import Day, Tick
from pandas.tseries.offsets import Day, Tick, generate_range

_midnight = time(0, 0)

Expand Down Expand Up @@ -370,33 +370,22 @@ def _generate_range(
if end is not None:
end = Timestamp(end)

if start is None and end is None:
if closed is not None:
raise ValueError(
"Closed has to be None if not both of start and end are defined"
)
if start is NaT or end is NaT:
raise ValueError("Neither `start` nor `end` can be NaT")

left_closed, right_closed = dtl.validate_endpoints(closed)

start, end, _normalized = _maybe_normalize_endpoints(start, end, normalize)

tz = _infer_tz_from_endpoints(start, end, tz)

if tz is not None:
# Localize the start and end arguments
start_tz = None if start is None else start.tz
end_tz = None if end is None else end.tz
start = _maybe_localize_point(
start,
getattr(start, "tz", None),
start,
freq,
tz,
ambiguous,
nonexistent,
start, start_tz, start, freq, tz, ambiguous, nonexistent
)
end = _maybe_localize_point(
end, getattr(end, "tz", None), end, freq, tz, ambiguous, nonexistent
end, end_tz, end, freq, tz, ambiguous, nonexistent
)
if freq is not None:
# We break Day arithmetic (fixed 24 hour) here and opt for
Expand All @@ -408,7 +397,13 @@ def _generate_range(
if end is not None:
end = end.tz_localize(None)

values, _tz = generate_regular_range(start, end, periods, freq)
if isinstance(freq, Tick):
values = generate_regular_range(start, end, periods, freq)
else:
xdr = generate_range(start=start, end=end, periods=periods, offset=freq)
values = np.array([x.value for x in xdr], dtype=np.int64)

_tz = start.tz if start is not None else end.tz
index = cls._simple_new(values, freq=freq, dtype=tz_to_dtype(_tz))

if tz is not None and index.tz is None:
Expand Down
30 changes: 2 additions & 28 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from pandas.core import nanops
from pandas.core.algorithms import checked_add_with_arr
from pandas.core.arrays import datetimelike as dtl
from pandas.core.arrays._ranges import generate_regular_range
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.ops.common import unpack_zerodim_and_defer
Expand Down Expand Up @@ -255,16 +256,10 @@ def _generate_range(cls, start, end, periods, freq, closed=None):
if end is not None:
end = Timedelta(end)

if start is None and end is None:
if closed is not None:
raise ValueError(
"Closed has to be None if not both of start and end are defined"
)

left_closed, right_closed = dtl.validate_endpoints(closed)

if freq is not None:
index = _generate_regular_range(start, end, periods, freq)
index = generate_regular_range(start, end, periods, freq)
else:
index = np.linspace(start.value, end.value, periods).astype("i8")
if len(index) >= 2:
Expand Down Expand Up @@ -1048,24 +1043,3 @@ def _validate_td64_dtype(dtype):
raise ValueError(f"dtype {dtype} cannot be converted to timedelta64[ns]")

return dtype


def _generate_regular_range(start, end, periods, offset):
stride = offset.nanos
if periods is None:
b = Timedelta(start).value
e = Timedelta(end).value
e += stride - e % stride
elif start is not None:
b = Timedelta(start).value
e = b + periods * stride
elif end is not None:
e = Timedelta(end).value + stride
b = e - periods * stride
else:
raise ValueError(
"at least 'start' or 'end' should be specified if a 'period' is given."
)

data = np.arange(b, e, stride, dtype=np.int64)
return data
5 changes: 4 additions & 1 deletion pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -1499,9 +1499,12 @@ def _get_time_delta_bins(self, ax):
end_stamps = labels + self.freq
bins = ax.searchsorted(end_stamps, side="left")

# Addresses GH #10530
if self.base > 0:
# GH #10530
labels += type(self.freq)(self.base)
if self.loffset:
# GH #33498
labels += self.loffset

return binner, bins, labels

Expand Down
20 changes: 19 additions & 1 deletion pandas/tests/indexes/timedeltas/test_timedelta_range.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pytest

from pandas import timedelta_range, to_timedelta
from pandas import Timedelta, timedelta_range, to_timedelta
import pandas._testing as tm

from pandas.tseries.offsets import Day, Second
Expand Down Expand Up @@ -61,3 +61,21 @@ def test_errors(self):
# too many params
with pytest.raises(ValueError, match=msg):
timedelta_range(start="0 days", end="5 days", periods=10, freq="H")

@pytest.mark.parametrize(
"start, end, freq, expected_periods",
[
("1D", "10D", "2D", (10 - 1) // 2 + 1),
("2D", "30D", "3D", (30 - 2) // 3 + 1),
("2s", "50s", "5s", (50 - 2) // 5 + 1),
# tests that worked before GH 33498:
("4D", "16D", "3D", (16 - 4) // 3 + 1),
("8D", "16D", "40s", (16 * 3600 * 24 - 8 * 3600 * 24) // 40 + 1),
],
)
def test_timedelta_range_freq_divide_end(self, start, end, freq, expected_periods):
# GH 33498 only the cases where `(end % freq) == 0` used to fail
res = timedelta_range(start=start, end=end, freq=freq)
assert Timedelta(start) == res[0]
assert Timedelta(end) >= res[-1]
assert len(res) == expected_periods
11 changes: 2 additions & 9 deletions pandas/tests/resample/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pandas.core.groupby.grouper import Grouper
from pandas.core.indexes.datetimes import date_range
from pandas.core.indexes.period import PeriodIndex, period_range
from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range
from pandas.core.indexes.timedeltas import timedelta_range
from pandas.core.resample import _asfreq_compat

# a fixture value can be overridden by the test parameter value. Note that the
Expand Down Expand Up @@ -182,7 +182,6 @@ def test_resample_size_empty_dataframe(freq, empty_frame_dti):
@pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0))
@pytest.mark.parametrize("dtype", [np.float, np.int, np.object, "datetime64[ns]"])
def test_resample_empty_dtypes(index, dtype, resample_method):

# Empty series were sometimes causing a segfault (for the functions
# with Cython bounds-checking disabled) or an IndexError. We just run
# them to ensure they no longer do. (GH #10228)
Expand Down Expand Up @@ -215,13 +214,7 @@ def test_resample_loffset_arg_type(frame, create_index, arg):
if isinstance(arg, list):
expected.columns = pd.MultiIndex.from_tuples([("value", "mean")])

# GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex
if isinstance(expected.index, TimedeltaIndex):
msg = "DataFrame are different"
with pytest.raises(AssertionError, match=msg):
tm.assert_frame_equal(result_agg, expected)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so this is testing incorrect behavior? thats pretty weird

Copy link
Member Author

@hasB4K hasB4K Apr 16, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel Yeah, that was the case. That was testing an incorrect behavior... 😕 And this was the case for two reasons:

Here an example of the values when setting a breakpoint with PyCharm line 222 in pandas/tests/resample/test_base.py on master:

The expected result (with loffset and without an extra bin):
expected

The actual result (without loffset and with an extra bin):
result_agg

I think that was the reason of the presence of this TODO:

# GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex

IMO, the presence of this if was just to skip the tests of Timedeltas when resampling without skipping the tests of Timestamps and Periods parametrized with @all_ts.

else:
tm.assert_frame_equal(result_agg, expected)
tm.assert_frame_equal(result_agg, expected)


@all_ts
Expand Down
30 changes: 28 additions & 2 deletions pandas/tests/resample/test_timedelta.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from datetime import timedelta

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Series
Expand Down Expand Up @@ -114,14 +115,39 @@ def test_resample_timedelta_values():
# check that timedelta dtype is preserved when NaT values are
# introduced by the resampling

times = timedelta_range("1 day", "4 day", freq="4D")
times = timedelta_range("1 day", "6 day", freq="4D")
df = DataFrame({"time": times}, index=times)

times2 = timedelta_range("1 day", "4 day", freq="2D")
times2 = timedelta_range("1 day", "6 day", freq="2D")
exp = Series(times2, index=times2, name="time")
exp.iloc[1] = pd.NaT

res = df.resample("2D").first()["time"]
tm.assert_series_equal(res, exp)
res = df["time"].resample("2D").first()
tm.assert_series_equal(res, exp)


@pytest.mark.parametrize(
"start, end, freq, resample_freq",
[
("8H", "21h59min50s", "10S", "3H"), # GH 30353 example
("3H", "22H", "1H", "5H"),
("527D", "5006D", "3D", "10D"),
("1D", "10D", "1D", "2D"), # GH 13022 example
# tests that worked before GH 33498:
("8H", "21h59min50s", "10S", "2H"),
("0H", "21h59min50s", "10S", "3H"),
("10D", "85D", "D", "2D"),
],
)
def test_resample_timedelta_edge_case(start, end, freq, resample_freq):
# GH 33498
# check that the timedelta bins does not contains an extra bin
idx = pd.timedelta_range(start=start, end=end, freq=freq)
s = pd.Series(np.arange(len(idx)), index=idx)
result = s.resample(resample_freq).min()
expected_index = pd.timedelta_range(freq=resample_freq, start=start, end=end)
tm.assert_index_equal(result.index, expected_index)
assert result.index.freq == expected_index.freq
assert not np.isnan(result[-1])