diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 95cb4ccbbb796..79f78471922bc 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -574,6 +574,9 @@ Timedelta - Timedeltas now understand ``µs`` as identifier for microsecond (:issue:`32899`) - :class:`Timedelta` string representation now includes nanoseconds, when nanoseconds are non-zero (:issue:`9309`) - Bug in comparing a :class:`Timedelta`` object against a ``np.ndarray`` with ``timedelta64`` dtype incorrectly viewing all entries as unequal (:issue:`33441`) +- Bug in :func:`timedelta_range` that produced an extra point on an edge case (:issue:`30353`, :issue:`33498`) +- Bug in :meth:`DataFrame.resample` that produced an extra point on an edge case (:issue:`30353`, :issue:`13022`, :issue:`33498`) +- Bug in :meth:`DataFrame.resample` that ignored the ``loffset`` argument when dealing with timedelta (:issue:`7687`, :issue:`33498`) Timezones ^^^^^^^^^ diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 471bfa736d4b9..3b090ca458d88 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -3,84 +3,71 @@ (and possibly TimedeltaArray/PeriodArray) """ -from typing import Tuple +from typing import Union import numpy as np -from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp +from pandas._libs.tslibs import OutOfBoundsDatetime, Timedelta, Timestamp -from pandas.tseries.offsets import DateOffset, Tick, generate_range +from pandas.tseries.offsets import DateOffset def generate_regular_range( - start: Timestamp, end: Timestamp, periods: int, freq: DateOffset -) -> Tuple[np.ndarray, str]: + start: Union[Timestamp, Timedelta], + end: Union[Timestamp, Timedelta], + periods: int, + freq: DateOffset, +): """ - Generate a range of dates with the spans between dates described by - the given `freq` DateOffset. 
+ Generate a range of dates or timestamps with the spans between dates + described by the given `freq` DateOffset. Parameters ---------- - start : Timestamp or None - first point of produced date range - end : Timestamp or None - last point of produced date range + start : Timedelta, Timestamp or None + First point of produced date range. + end : Timedelta, Timestamp or None + Last point of produced date range. periods : int - number of periods in produced date range - freq : DateOffset - describes space between dates in produced date range + Number of periods in produced date range. + freq : Tick + Describes space between dates in produced date range. Returns ------- - ndarray[np.int64] representing nanosecond unix timestamps + ndarray[np.int64] Representing nanoseconds. """ - if isinstance(freq, Tick): - stride = freq.nanos - if periods is None: - b = Timestamp(start).value - # cannot just use e = Timestamp(end) + 1 because arange breaks when - # stride is too large, see GH10887 - e = b + (Timestamp(end).value - b) // stride * stride + stride // 2 + 1 - # end.tz == start.tz by this point due to _generate implementation - tz = start.tz - elif start is not None: - b = Timestamp(start).value - e = _generate_range_overflow_safe(b, periods, stride, side="start") - tz = start.tz - elif end is not None: - e = Timestamp(end).value + stride - b = _generate_range_overflow_safe(e, periods, stride, side="end") - tz = end.tz - else: - raise ValueError( - "at least 'start' or 'end' should be specified " - "if a 'period' is given." - ) - - with np.errstate(over="raise"): - # If the range is sufficiently large, np.arange may overflow - # and incorrectly return an empty array if not caught. 
- try: - values = np.arange(b, e, stride, dtype=np.int64) - except FloatingPointError: - xdr = [b] - while xdr[-1] != e: - xdr.append(xdr[-1] + stride) - values = np.array(xdr[:-1], dtype=np.int64) - + start = start.value if start is not None else None + end = end.value if end is not None else None + stride = freq.nanos + + if periods is None: + b = start + # cannot just use e = Timestamp(end) + 1 because arange breaks when + # stride is too large, see GH10887 + e = b + (end - b) // stride * stride + stride // 2 + 1 + elif start is not None: + b = start + e = _generate_range_overflow_safe(b, periods, stride, side="start") + elif end is not None: + e = end + stride + b = _generate_range_overflow_safe(e, periods, stride, side="end") else: - tz = None - # start and end should have the same timezone by this point - if start is not None: - tz = start.tz - elif end is not None: - tz = end.tz - - xdr = generate_range(start=start, end=end, periods=periods, offset=freq) - - values = np.array([x.value for x in xdr], dtype=np.int64) + raise ValueError( + "at least 'start' or 'end' should be specified if a 'period' is given." + ) - return values, tz + with np.errstate(over="raise"): + # If the range is sufficiently large, np.arange may overflow + # and incorrectly return an empty array if not caught. 
+ try: + values = np.arange(b, e, stride, dtype=np.int64) + except FloatingPointError: + xdr = [b] + while xdr[-1] != e: + xdr.append(xdr[-1] + stride) + values = np.array(xdr[:-1], dtype=np.int64) + return values def _generate_range_overflow_safe( diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8a1cacfe304ca..3134ffab2ea5a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -48,7 +48,7 @@ import pandas.core.common as com from pandas.tseries.frequencies import get_period_alias, to_offset -from pandas.tseries.offsets import Day, Tick +from pandas.tseries.offsets import Day, Tick, generate_range _midnight = time(0, 0) @@ -370,33 +370,22 @@ def _generate_range( if end is not None: end = Timestamp(end) - if start is None and end is None: - if closed is not None: - raise ValueError( - "Closed has to be None if not both of start and end are defined" - ) if start is NaT or end is NaT: raise ValueError("Neither `start` nor `end` can be NaT") left_closed, right_closed = dtl.validate_endpoints(closed) - start, end, _normalized = _maybe_normalize_endpoints(start, end, normalize) - tz = _infer_tz_from_endpoints(start, end, tz) if tz is not None: # Localize the start and end arguments + start_tz = None if start is None else start.tz + end_tz = None if end is None else end.tz start = _maybe_localize_point( - start, - getattr(start, "tz", None), - start, - freq, - tz, - ambiguous, - nonexistent, + start, start_tz, start, freq, tz, ambiguous, nonexistent ) end = _maybe_localize_point( - end, getattr(end, "tz", None), end, freq, tz, ambiguous, nonexistent + end, end_tz, end, freq, tz, ambiguous, nonexistent ) if freq is not None: # We break Day arithmetic (fixed 24 hour) here and opt for @@ -408,7 +397,13 @@ def _generate_range( if end is not None: end = end.tz_localize(None) - values, _tz = generate_regular_range(start, end, periods, freq) + if isinstance(freq, Tick): + values = generate_regular_range(start, 
end, periods, freq) + else: + xdr = generate_range(start=start, end=end, periods=periods, offset=freq) + values = np.array([x.value for x in xdr], dtype=np.int64) + + _tz = start.tz if start is not None else end.tz index = cls._simple_new(values, freq=freq, dtype=tz_to_dtype(_tz)) if tz is not None and index.tz is None: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a62f94b1a3665..8cd4b874d10ee 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -33,6 +33,7 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.ops.common import unpack_zerodim_and_defer @@ -255,16 +256,10 @@ def _generate_range(cls, start, end, periods, freq, closed=None): if end is not None: end = Timedelta(end) - if start is None and end is None: - if closed is not None: - raise ValueError( - "Closed has to be None if not both of start and end are defined" - ) - left_closed, right_closed = dtl.validate_endpoints(closed) if freq is not None: - index = _generate_regular_range(start, end, periods, freq) + index = generate_regular_range(start, end, periods, freq) else: index = np.linspace(start.value, end.value, periods).astype("i8") if len(index) >= 2: @@ -1048,24 +1043,3 @@ def _validate_td64_dtype(dtype): raise ValueError(f"dtype {dtype} cannot be converted to timedelta64[ns]") return dtype - - -def _generate_regular_range(start, end, periods, offset): - stride = offset.nanos - if periods is None: - b = Timedelta(start).value - e = Timedelta(end).value - e += stride - e % stride - elif start is not None: - b = Timedelta(start).value - e = b + periods * stride - elif end is not None: - e = Timedelta(end).value + stride - b = e - periods * stride - else: - raise ValueError( - "at 
least 'start' or 'end' should be specified if a 'period' is given." - ) - - data = np.arange(b, e, stride, dtype=np.int64) - return data diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 06751d9c35fab..6d79ae070c103 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1499,9 +1499,12 @@ def _get_time_delta_bins(self, ax): end_stamps = labels + self.freq bins = ax.searchsorted(end_stamps, side="left") - # Addresses GH #10530 if self.base > 0: + # GH #10530 labels += type(self.freq)(self.base) + if self.loffset: + # GH #33498 + labels += self.loffset return binner, bins, labels diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index c07a6471c732f..7d78fbf9ff190 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import timedelta_range, to_timedelta +from pandas import Timedelta, timedelta_range, to_timedelta import pandas._testing as tm from pandas.tseries.offsets import Day, Second @@ -61,3 +61,21 @@ def test_errors(self): # too many params with pytest.raises(ValueError, match=msg): timedelta_range(start="0 days", end="5 days", periods=10, freq="H") + + @pytest.mark.parametrize( + "start, end, freq, expected_periods", + [ + ("1D", "10D", "2D", (10 - 1) // 2 + 1), + ("2D", "30D", "3D", (30 - 2) // 3 + 1), + ("2s", "50s", "5s", (50 - 2) // 5 + 1), + # tests that worked before GH 33498: + ("4D", "16D", "3D", (16 - 4) // 3 + 1), + ("8D", "16D", "40s", (16 * 3600 * 24 - 8 * 3600 * 24) // 40 + 1), + ], + ) + def test_timedelta_range_freq_divide_end(self, start, end, freq, expected_periods): + # GH 33498 only the cases where `(end % freq) == 0` used to fail + res = timedelta_range(start=start, end=end, freq=freq) + assert Timedelta(start) == res[0] + assert Timedelta(end) >= res[-1] + assert len(res) == expected_periods 
diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 6384c5f19c898..d0559923fec51 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -10,7 +10,7 @@ from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import PeriodIndex, period_range -from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range +from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.resample import _asfreq_compat # a fixture value can be overridden by the test parameter value. Note that the @@ -182,7 +182,6 @@ def test_resample_size_empty_dataframe(freq, empty_frame_dti): @pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) @pytest.mark.parametrize("dtype", [np.float, np.int, np.object, "datetime64[ns]"]) def test_resample_empty_dtypes(index, dtype, resample_method): - # Empty series were sometimes causing a segfault (for the functions # with Cython bounds-checking disabled) or an IndexError. We just run # them to ensure they no longer do. 
(GH #10228) @@ -215,13 +214,7 @@ def test_resample_loffset_arg_type(frame, create_index, arg): if isinstance(arg, list): expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) - # GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex - if isinstance(expected.index, TimedeltaIndex): - msg = "DataFrame are different" - with pytest.raises(AssertionError, match=msg): - tm.assert_frame_equal(result_agg, expected) - else: - tm.assert_frame_equal(result_agg, expected) + tm.assert_frame_equal(result_agg, expected) @all_ts diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 9fc355a45b656..1b4a625f078c9 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -1,6 +1,7 @@ from datetime import timedelta import numpy as np +import pytest import pandas as pd from pandas import DataFrame, Series @@ -114,10 +115,10 @@ def test_resample_timedelta_values(): # check that timedelta dtype is preserved when NaT values are # introduced by the resampling - times = timedelta_range("1 day", "4 day", freq="4D") + times = timedelta_range("1 day", "6 day", freq="4D") df = DataFrame({"time": times}, index=times) - times2 = timedelta_range("1 day", "4 day", freq="2D") + times2 = timedelta_range("1 day", "6 day", freq="2D") exp = Series(times2, index=times2, name="time") exp.iloc[1] = pd.NaT @@ -125,3 +126,28 @@ def test_resample_timedelta_values(): tm.assert_series_equal(res, exp) res = df["time"].resample("2D").first() tm.assert_series_equal(res, exp) + + +@pytest.mark.parametrize( + "start, end, freq, resample_freq", + [ + ("8H", "21h59min50s", "10S", "3H"), # GH 30353 example + ("3H", "22H", "1H", "5H"), + ("527D", "5006D", "3D", "10D"), + ("1D", "10D", "1D", "2D"), # GH 13022 example + # tests that worked before GH 33498: + ("8H", "21h59min50s", "10S", "2H"), + ("0H", "21h59min50s", "10S", "3H"), + ("10D", "85D", "D", "2D"), + ], +) +def test_resample_timedelta_edge_case(start, end, 
freq, resample_freq): + # GH 33498 + # check that the timedelta bins do not contain an extra bin + idx = pd.timedelta_range(start=start, end=end, freq=freq) + s = pd.Series(np.arange(len(idx)), index=idx) + result = s.resample(resample_freq).min() + expected_index = pd.timedelta_range(freq=resample_freq, start=start, end=end) + tm.assert_index_equal(result.index, expected_index) + assert result.index.freq == expected_index.freq + assert not np.isnan(result[-1])