Skip to content

Commit 7c3f662

Browse files
authored
BUG: Fix a bug in 'timedelta_range' that produced an extra point on an edge case (fix #30353) (#33498)
1 parent e0eafaa commit 7c3f662

File tree

8 files changed

+117
-118
lines changed

8 files changed

+117
-118
lines changed

doc/source/whatsnew/v1.1.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,9 @@ Timedelta
574574
- Timedeltas now understand ``µs`` as identifier for microsecond (:issue:`32899`)
575575
- :class:`Timedelta` string representation now includes nanoseconds, when nanoseconds are non-zero (:issue:`9309`)
576576
- Bug in comparing a :class:`Timedelta` object against a ``np.ndarray`` with ``timedelta64`` dtype incorrectly viewing all entries as unequal (:issue:`33441`)
577+
- Bug in :func:`timedelta_range` that produced an extra point on an edge case (:issue:`30353`, :issue:`33498`)
578+
- Bug in :meth:`DataFrame.resample` that produced an extra point on an edge case (:issue:`30353`, :issue:`13022`, :issue:`33498`)
579+
- Bug in :meth:`DataFrame.resample` that ignored the ``loffset`` argument when dealing with timedelta (:issue:`7687`, :issue:`33498`)
577580

578581
Timezones
579582
^^^^^^^^^

pandas/core/arrays/_ranges.py

+47-60
Original file line numberDiff line numberDiff line change
@@ -3,84 +3,71 @@
33
(and possibly TimedeltaArray/PeriodArray)
44
"""
55

6-
from typing import Tuple
6+
from typing import Union
77

88
import numpy as np
99

10-
from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp
10+
from pandas._libs.tslibs import OutOfBoundsDatetime, Timedelta, Timestamp
1111

12-
from pandas.tseries.offsets import DateOffset, Tick, generate_range
12+
from pandas.tseries.offsets import DateOffset
1313

1414

1515
def generate_regular_range(
16-
start: Timestamp, end: Timestamp, periods: int, freq: DateOffset
17-
) -> Tuple[np.ndarray, str]:
16+
start: Union[Timestamp, Timedelta],
17+
end: Union[Timestamp, Timedelta],
18+
periods: int,
19+
freq: DateOffset,
20+
):
1821
"""
19-
Generate a range of dates with the spans between dates described by
20-
the given `freq` DateOffset.
22+
Generate a range of dates or timestamps with the spans between dates
23+
described by the given `freq` DateOffset.
2124
2225
Parameters
2326
----------
24-
start : Timestamp or None
25-
first point of produced date range
26-
end : Timestamp or None
27-
last point of produced date range
27+
start : Timedelta, Timestamp or None
28+
First point of produced date range.
29+
end : Timedelta, Timestamp or None
30+
Last point of produced date range.
2831
periods : int
29-
number of periods in produced date range
30-
freq : DateOffset
31-
describes space between dates in produced date range
32+
Number of periods in produced date range.
33+
freq : Tick
34+
Describes space between dates in produced date range.
3235
3336
Returns
3437
-------
35-
ndarray[np.int64] representing nanosecond unix timestamps
38+
ndarray[np.int64] Representing nanoseconds.
3639
"""
37-
if isinstance(freq, Tick):
38-
stride = freq.nanos
39-
if periods is None:
40-
b = Timestamp(start).value
41-
# cannot just use e = Timestamp(end) + 1 because arange breaks when
42-
# stride is too large, see GH10887
43-
e = b + (Timestamp(end).value - b) // stride * stride + stride // 2 + 1
44-
# end.tz == start.tz by this point due to _generate implementation
45-
tz = start.tz
46-
elif start is not None:
47-
b = Timestamp(start).value
48-
e = _generate_range_overflow_safe(b, periods, stride, side="start")
49-
tz = start.tz
50-
elif end is not None:
51-
e = Timestamp(end).value + stride
52-
b = _generate_range_overflow_safe(e, periods, stride, side="end")
53-
tz = end.tz
54-
else:
55-
raise ValueError(
56-
"at least 'start' or 'end' should be specified "
57-
"if a 'period' is given."
58-
)
59-
60-
with np.errstate(over="raise"):
61-
# If the range is sufficiently large, np.arange may overflow
62-
# and incorrectly return an empty array if not caught.
63-
try:
64-
values = np.arange(b, e, stride, dtype=np.int64)
65-
except FloatingPointError:
66-
xdr = [b]
67-
while xdr[-1] != e:
68-
xdr.append(xdr[-1] + stride)
69-
values = np.array(xdr[:-1], dtype=np.int64)
70-
40+
start = start.value if start is not None else None
41+
end = end.value if end is not None else None
42+
stride = freq.nanos
43+
44+
if periods is None:
45+
b = start
46+
# cannot just use e = Timestamp(end) + 1 because arange breaks when
47+
# stride is too large, see GH10887
48+
e = b + (end - b) // stride * stride + stride // 2 + 1
49+
elif start is not None:
50+
b = start
51+
e = _generate_range_overflow_safe(b, periods, stride, side="start")
52+
elif end is not None:
53+
e = end + stride
54+
b = _generate_range_overflow_safe(e, periods, stride, side="end")
7155
else:
72-
tz = None
73-
# start and end should have the same timezone by this point
74-
if start is not None:
75-
tz = start.tz
76-
elif end is not None:
77-
tz = end.tz
78-
79-
xdr = generate_range(start=start, end=end, periods=periods, offset=freq)
80-
81-
values = np.array([x.value for x in xdr], dtype=np.int64)
56+
raise ValueError(
57+
"at least 'start' or 'end' should be specified if a 'period' is given."
58+
)
8259

83-
return values, tz
60+
with np.errstate(over="raise"):
61+
# If the range is sufficiently large, np.arange may overflow
62+
# and incorrectly return an empty array if not caught.
63+
try:
64+
values = np.arange(b, e, stride, dtype=np.int64)
65+
except FloatingPointError:
66+
xdr = [b]
67+
while xdr[-1] != e:
68+
xdr.append(xdr[-1] + stride)
69+
values = np.array(xdr[:-1], dtype=np.int64)
70+
return values
8471

8572

8673
def _generate_range_overflow_safe(

pandas/core/arrays/datetimes.py

+12-17
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
import pandas.core.common as com
4949

5050
from pandas.tseries.frequencies import get_period_alias, to_offset
51-
from pandas.tseries.offsets import Day, Tick
51+
from pandas.tseries.offsets import Day, Tick, generate_range
5252

5353
_midnight = time(0, 0)
5454

@@ -370,33 +370,22 @@ def _generate_range(
370370
if end is not None:
371371
end = Timestamp(end)
372372

373-
if start is None and end is None:
374-
if closed is not None:
375-
raise ValueError(
376-
"Closed has to be None if not both of start and end are defined"
377-
)
378373
if start is NaT or end is NaT:
379374
raise ValueError("Neither `start` nor `end` can be NaT")
380375

381376
left_closed, right_closed = dtl.validate_endpoints(closed)
382-
383377
start, end, _normalized = _maybe_normalize_endpoints(start, end, normalize)
384-
385378
tz = _infer_tz_from_endpoints(start, end, tz)
386379

387380
if tz is not None:
388381
# Localize the start and end arguments
382+
start_tz = None if start is None else start.tz
383+
end_tz = None if end is None else end.tz
389384
start = _maybe_localize_point(
390-
start,
391-
getattr(start, "tz", None),
392-
start,
393-
freq,
394-
tz,
395-
ambiguous,
396-
nonexistent,
385+
start, start_tz, start, freq, tz, ambiguous, nonexistent
397386
)
398387
end = _maybe_localize_point(
399-
end, getattr(end, "tz", None), end, freq, tz, ambiguous, nonexistent
388+
end, end_tz, end, freq, tz, ambiguous, nonexistent
400389
)
401390
if freq is not None:
402391
# We break Day arithmetic (fixed 24 hour) here and opt for
@@ -408,7 +397,13 @@ def _generate_range(
408397
if end is not None:
409398
end = end.tz_localize(None)
410399

411-
values, _tz = generate_regular_range(start, end, periods, freq)
400+
if isinstance(freq, Tick):
401+
values = generate_regular_range(start, end, periods, freq)
402+
else:
403+
xdr = generate_range(start=start, end=end, periods=periods, offset=freq)
404+
values = np.array([x.value for x in xdr], dtype=np.int64)
405+
406+
_tz = start.tz if start is not None else end.tz
412407
index = cls._simple_new(values, freq=freq, dtype=tz_to_dtype(_tz))
413408

414409
if tz is not None and index.tz is None:

pandas/core/arrays/timedeltas.py

+2-28
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from pandas.core import nanops
3434
from pandas.core.algorithms import checked_add_with_arr
3535
from pandas.core.arrays import datetimelike as dtl
36+
from pandas.core.arrays._ranges import generate_regular_range
3637
import pandas.core.common as com
3738
from pandas.core.construction import extract_array
3839
from pandas.core.ops.common import unpack_zerodim_and_defer
@@ -255,16 +256,10 @@ def _generate_range(cls, start, end, periods, freq, closed=None):
255256
if end is not None:
256257
end = Timedelta(end)
257258

258-
if start is None and end is None:
259-
if closed is not None:
260-
raise ValueError(
261-
"Closed has to be None if not both of start and end are defined"
262-
)
263-
264259
left_closed, right_closed = dtl.validate_endpoints(closed)
265260

266261
if freq is not None:
267-
index = _generate_regular_range(start, end, periods, freq)
262+
index = generate_regular_range(start, end, periods, freq)
268263
else:
269264
index = np.linspace(start.value, end.value, periods).astype("i8")
270265
if len(index) >= 2:
@@ -1048,24 +1043,3 @@ def _validate_td64_dtype(dtype):
10481043
raise ValueError(f"dtype {dtype} cannot be converted to timedelta64[ns]")
10491044

10501045
return dtype
1051-
1052-
1053-
def _generate_regular_range(start, end, periods, offset):
1054-
stride = offset.nanos
1055-
if periods is None:
1056-
b = Timedelta(start).value
1057-
e = Timedelta(end).value
1058-
e += stride - e % stride
1059-
elif start is not None:
1060-
b = Timedelta(start).value
1061-
e = b + periods * stride
1062-
elif end is not None:
1063-
e = Timedelta(end).value + stride
1064-
b = e - periods * stride
1065-
else:
1066-
raise ValueError(
1067-
"at least 'start' or 'end' should be specified if a 'period' is given."
1068-
)
1069-
1070-
data = np.arange(b, e, stride, dtype=np.int64)
1071-
return data

pandas/core/resample.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1499,9 +1499,12 @@ def _get_time_delta_bins(self, ax):
14991499
end_stamps = labels + self.freq
15001500
bins = ax.searchsorted(end_stamps, side="left")
15011501

1502-
# Addresses GH #10530
15031502
if self.base > 0:
1503+
# GH #10530
15041504
labels += type(self.freq)(self.base)
1505+
if self.loffset:
1506+
# GH #33498
1507+
labels += self.loffset
15051508

15061509
return binner, bins, labels
15071510

pandas/tests/indexes/timedeltas/test_timedelta_range.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import numpy as np
22
import pytest
33

4-
from pandas import timedelta_range, to_timedelta
4+
from pandas import Timedelta, timedelta_range, to_timedelta
55
import pandas._testing as tm
66

77
from pandas.tseries.offsets import Day, Second
@@ -61,3 +61,21 @@ def test_errors(self):
6161
# too many params
6262
with pytest.raises(ValueError, match=msg):
6363
timedelta_range(start="0 days", end="5 days", periods=10, freq="H")
64+
65+
@pytest.mark.parametrize(
66+
"start, end, freq, expected_periods",
67+
[
68+
("1D", "10D", "2D", (10 - 1) // 2 + 1),
69+
("2D", "30D", "3D", (30 - 2) // 3 + 1),
70+
("2s", "50s", "5s", (50 - 2) // 5 + 1),
71+
# tests that worked before GH 33498:
72+
("4D", "16D", "3D", (16 - 4) // 3 + 1),
73+
("8D", "16D", "40s", (16 * 3600 * 24 - 8 * 3600 * 24) // 40 + 1),
74+
],
75+
)
76+
def test_timedelta_range_freq_divide_end(self, start, end, freq, expected_periods):
77+
# GH 33498 only the cases where `(end % freq) == 0` used to fail
78+
res = timedelta_range(start=start, end=end, freq=freq)
79+
assert Timedelta(start) == res[0]
80+
assert Timedelta(end) >= res[-1]
81+
assert len(res) == expected_periods

pandas/tests/resample/test_base.py

+2-9
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from pandas.core.groupby.grouper import Grouper
1111
from pandas.core.indexes.datetimes import date_range
1212
from pandas.core.indexes.period import PeriodIndex, period_range
13-
from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range
13+
from pandas.core.indexes.timedeltas import timedelta_range
1414
from pandas.core.resample import _asfreq_compat
1515

1616
# a fixture value can be overridden by the test parameter value. Note that the
@@ -182,7 +182,6 @@ def test_resample_size_empty_dataframe(freq, empty_frame_dti):
182182
@pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0))
183183
@pytest.mark.parametrize("dtype", [np.float, np.int, np.object, "datetime64[ns]"])
184184
def test_resample_empty_dtypes(index, dtype, resample_method):
185-
186185
# Empty series were sometimes causing a segfault (for the functions
187186
# with Cython bounds-checking disabled) or an IndexError. We just run
188187
# them to ensure they no longer do. (GH #10228)
@@ -215,13 +214,7 @@ def test_resample_loffset_arg_type(frame, create_index, arg):
215214
if isinstance(arg, list):
216215
expected.columns = pd.MultiIndex.from_tuples([("value", "mean")])
217216

218-
# GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex
219-
if isinstance(expected.index, TimedeltaIndex):
220-
msg = "DataFrame are different"
221-
with pytest.raises(AssertionError, match=msg):
222-
tm.assert_frame_equal(result_agg, expected)
223-
else:
224-
tm.assert_frame_equal(result_agg, expected)
217+
tm.assert_frame_equal(result_agg, expected)
225218

226219

227220
@all_ts

pandas/tests/resample/test_timedelta.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from datetime import timedelta
22

33
import numpy as np
4+
import pytest
45

56
import pandas as pd
67
from pandas import DataFrame, Series
@@ -114,14 +115,39 @@ def test_resample_timedelta_values():
114115
# check that timedelta dtype is preserved when NaT values are
115116
# introduced by the resampling
116117

117-
times = timedelta_range("1 day", "4 day", freq="4D")
118+
times = timedelta_range("1 day", "6 day", freq="4D")
118119
df = DataFrame({"time": times}, index=times)
119120

120-
times2 = timedelta_range("1 day", "4 day", freq="2D")
121+
times2 = timedelta_range("1 day", "6 day", freq="2D")
121122
exp = Series(times2, index=times2, name="time")
122123
exp.iloc[1] = pd.NaT
123124

124125
res = df.resample("2D").first()["time"]
125126
tm.assert_series_equal(res, exp)
126127
res = df["time"].resample("2D").first()
127128
tm.assert_series_equal(res, exp)
129+
130+
131+
@pytest.mark.parametrize(
132+
"start, end, freq, resample_freq",
133+
[
134+
("8H", "21h59min50s", "10S", "3H"), # GH 30353 example
135+
("3H", "22H", "1H", "5H"),
136+
("527D", "5006D", "3D", "10D"),
137+
("1D", "10D", "1D", "2D"), # GH 13022 example
138+
# tests that worked before GH 33498:
139+
("8H", "21h59min50s", "10S", "2H"),
140+
("0H", "21h59min50s", "10S", "3H"),
141+
("10D", "85D", "D", "2D"),
142+
],
143+
)
144+
def test_resample_timedelta_edge_case(start, end, freq, resample_freq):
145+
# GH 33498
146+
# check that the timedelta bins do not contain an extra bin
147+
idx = pd.timedelta_range(start=start, end=end, freq=freq)
148+
s = pd.Series(np.arange(len(idx)), index=idx)
149+
result = s.resample(resample_freq).min()
150+
expected_index = pd.timedelta_range(freq=resample_freq, start=start, end=end)
151+
tm.assert_index_equal(result.index, expected_index)
152+
assert result.index.freq == expected_index.freq
153+
assert not np.isnan(result[-1])

0 commit comments

Comments
 (0)