BUG: Fix a bug in 'timedelta_range' that produced an extra point on an edge case (fix #30353) (#33498)

hasB4K · web-flow · commit 7c3f662d1e37 · 2020-05-09T15:23:29.000-04:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -574,6 +574,9 @@ Timedelta
 - Timedeltas now understand ``µs`` as identifier for microsecond (:issue:`32899`)
 - :class:`Timedelta` string representation now includes nanoseconds, when nanoseconds are non-zero (:issue:`9309`)
 - Bug in comparing a :class:`Timedelta`` object against a ``np.ndarray`` with ``timedelta64`` dtype incorrectly viewing all entries as unequal (:issue:`33441`)
+- Bug in :func:`timedelta_range` that produced an extra point on an edge case (:issue:`30353`, :issue:`33498`)
+- Bug in :meth:`DataFrame.resample` that produced an extra point on an edge case (:issue:`30353`, :issue:`13022`, :issue:`33498`)
+- Bug in :meth:`DataFrame.resample` that ignored the ``loffset`` argument when dealing with timedelta (:issue:`7687`, :issue:`33498`)
 
 Timezones
 ^^^^^^^^^
diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py
@@ -3,84 +3,71 @@
 (and possibly TimedeltaArray/PeriodArray)
 """
 
-from typing import Tuple
+from typing import Union
 
 import numpy as np
 
-from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp
+from pandas._libs.tslibs import OutOfBoundsDatetime, Timedelta, Timestamp
 
-from pandas.tseries.offsets import DateOffset, Tick, generate_range
+from pandas.tseries.offsets import DateOffset
 
 
 def generate_regular_range(
-    start: Timestamp, end: Timestamp, periods: int, freq: DateOffset
-) -> Tuple[np.ndarray, str]:
+    start: Union[Timestamp, Timedelta],
+    end: Union[Timestamp, Timedelta],
+    periods: int,
+    freq: DateOffset,
+):
     """
-    Generate a range of dates with the spans between dates described by
-    the given `freq` DateOffset.
+    Generate a range of timestamps or timedeltas with the spans between them
+    described by the given `freq` DateOffset.
 
     Parameters
     ----------
-    start : Timestamp or None
-        first point of produced date range
-    end : Timestamp or None
-        last point of produced date range
+    start : Timedelta, Timestamp or None
+        First point of produced date range.
+    end : Timedelta, Timestamp or None
+        Last point of produced date range.
     periods : int
-        number of periods in produced date range
-    freq : DateOffset
-        describes space between dates in produced date range
+        Number of periods in produced date range.
+    freq : Tick
+        Describes space between dates in produced date range.
 
     Returns
     -------
-    ndarray[np.int64] representing nanosecond unix timestamps
+    ndarray[np.int64] Representing nanoseconds.
     """
-    if isinstance(freq, Tick):
-        stride = freq.nanos
-        if periods is None:
-            b = Timestamp(start).value
-            # cannot just use e = Timestamp(end) + 1 because arange breaks when
-            # stride is too large, see GH10887
-            e = b + (Timestamp(end).value - b) // stride * stride + stride // 2 + 1
-            # end.tz == start.tz by this point due to _generate implementation
-            tz = start.tz
-        elif start is not None:
-            b = Timestamp(start).value
-            e = _generate_range_overflow_safe(b, periods, stride, side="start")
-            tz = start.tz
-        elif end is not None:
-            e = Timestamp(end).value + stride
-            b = _generate_range_overflow_safe(e, periods, stride, side="end")
-            tz = end.tz
-        else:
-            raise ValueError(
-                "at least 'start' or 'end' should be specified "
-                "if a 'period' is given."
-            )
-
-        with np.errstate(over="raise"):
-            # If the range is sufficiently large, np.arange may overflow
-            #  and incorrectly return an empty array if not caught.
-            try:
-                values = np.arange(b, e, stride, dtype=np.int64)
-            except FloatingPointError:
-                xdr = [b]
-                while xdr[-1] != e:
-                    xdr.append(xdr[-1] + stride)
-                values = np.array(xdr[:-1], dtype=np.int64)
-
+    start = start.value if start is not None else None
+    end = end.value if end is not None else None
+    stride = freq.nanos
+
+    if periods is None:
+        b = start
+        # cannot just use e = Timestamp(end) + 1 because arange breaks when
+        # stride is too large, see GH10887
+        e = b + (end - b) // stride * stride + stride // 2 + 1
+    elif start is not None:
+        b = start
+        e = _generate_range_overflow_safe(b, periods, stride, side="start")
+    elif end is not None:
+        e = end + stride
+        b = _generate_range_overflow_safe(e, periods, stride, side="end")
     else:
-        tz = None
-        # start and end should have the same timezone by this point
-        if start is not None:
-            tz = start.tz
-        elif end is not None:
-            tz = end.tz
-
-        xdr = generate_range(start=start, end=end, periods=periods, offset=freq)
-
-        values = np.array([x.value for x in xdr], dtype=np.int64)
+        raise ValueError(
+            "at least 'start' or 'end' should be specified if a 'period' is given."
+        )
 
-    return values, tz
+    with np.errstate(over="raise"):
+        # If the range is sufficiently large, np.arange may overflow
+        #  and incorrectly return an empty array if not caught.
+        try:
+            values = np.arange(b, e, stride, dtype=np.int64)
+        except FloatingPointError:
+            xdr = [b]
+            while xdr[-1] != e:
+                xdr.append(xdr[-1] + stride)
+            values = np.array(xdr[:-1], dtype=np.int64)
+    return values
 
 
 def _generate_range_overflow_safe(
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -48,7 +48,7 @@
 import pandas.core.common as com
 
 from pandas.tseries.frequencies import get_period_alias, to_offset
-from pandas.tseries.offsets import Day, Tick
+from pandas.tseries.offsets import Day, Tick, generate_range
 
 _midnight = time(0, 0)
 
@@ -370,33 +370,22 @@ def _generate_range(
         if end is not None:
             end = Timestamp(end)
 
-        if start is None and end is None:
-            if closed is not None:
-                raise ValueError(
-                    "Closed has to be None if not both of start and end are defined"
-                )
         if start is NaT or end is NaT:
             raise ValueError("Neither `start` nor `end` can be NaT")
 
         left_closed, right_closed = dtl.validate_endpoints(closed)
-
         start, end, _normalized = _maybe_normalize_endpoints(start, end, normalize)
-
         tz = _infer_tz_from_endpoints(start, end, tz)
 
         if tz is not None:
             # Localize the start and end arguments
+            start_tz = None if start is None else start.tz
+            end_tz = None if end is None else end.tz
             start = _maybe_localize_point(
-                start,
-                getattr(start, "tz", None),
-                start,
-                freq,
-                tz,
-                ambiguous,
-                nonexistent,
+                start, start_tz, start, freq, tz, ambiguous, nonexistent
             )
             end = _maybe_localize_point(
-                end, getattr(end, "tz", None), end, freq, tz, ambiguous, nonexistent
+                end, end_tz, end, freq, tz, ambiguous, nonexistent
             )
         if freq is not None:
             # We break Day arithmetic (fixed 24 hour) here and opt for
@@ -408,7 +397,13 @@ def _generate_range(
                 if end is not None:
                     end = end.tz_localize(None)
 
-            values, _tz = generate_regular_range(start, end, periods, freq)
+            if isinstance(freq, Tick):
+                values = generate_regular_range(start, end, periods, freq)
+            else:
+                xdr = generate_range(start=start, end=end, periods=periods, offset=freq)
+                values = np.array([x.value for x in xdr], dtype=np.int64)
+
+            _tz = start.tz if start is not None else end.tz
             index = cls._simple_new(values, freq=freq, dtype=tz_to_dtype(_tz))
 
             if tz is not None and index.tz is None:
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -33,6 +33,7 @@
 from pandas.core import nanops
 from pandas.core.algorithms import checked_add_with_arr
 from pandas.core.arrays import datetimelike as dtl
+from pandas.core.arrays._ranges import generate_regular_range
 import pandas.core.common as com
 from pandas.core.construction import extract_array
 from pandas.core.ops.common import unpack_zerodim_and_defer
@@ -255,16 +256,10 @@ def _generate_range(cls, start, end, periods, freq, closed=None):
         if end is not None:
             end = Timedelta(end)
 
-        if start is None and end is None:
-            if closed is not None:
-                raise ValueError(
-                    "Closed has to be None if not both of start and end are defined"
-                )
-
         left_closed, right_closed = dtl.validate_endpoints(closed)
 
         if freq is not None:
-            index = _generate_regular_range(start, end, periods, freq)
+            index = generate_regular_range(start, end, periods, freq)
         else:
             index = np.linspace(start.value, end.value, periods).astype("i8")
             if len(index) >= 2:
@@ -1048,24 +1043,3 @@ def _validate_td64_dtype(dtype):
         raise ValueError(f"dtype {dtype} cannot be converted to timedelta64[ns]")
 
     return dtype
-
-
-def _generate_regular_range(start, end, periods, offset):
-    stride = offset.nanos
-    if periods is None:
-        b = Timedelta(start).value
-        e = Timedelta(end).value
-        e += stride - e % stride
-    elif start is not None:
-        b = Timedelta(start).value
-        e = b + periods * stride
-    elif end is not None:
-        e = Timedelta(end).value + stride
-        b = e - periods * stride
-    else:
-        raise ValueError(
-            "at least 'start' or 'end' should be specified if a 'period' is given."
-        )
-
-    data = np.arange(b, e, stride, dtype=np.int64)
-    return data
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -1499,9 +1499,12 @@ def _get_time_delta_bins(self, ax):
         end_stamps = labels + self.freq
         bins = ax.searchsorted(end_stamps, side="left")
 
-        # Addresses GH #10530
         if self.base > 0:
+            # GH #10530
             labels += type(self.freq)(self.base)
+        if self.loffset:
+            # GH #33498
+            labels += self.loffset
 
         return binner, bins, labels
 
diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-from pandas import timedelta_range, to_timedelta
+from pandas import Timedelta, timedelta_range, to_timedelta
 import pandas._testing as tm
 
 from pandas.tseries.offsets import Day, Second
@@ -61,3 +61,21 @@ def test_errors(self):
         # too many params
         with pytest.raises(ValueError, match=msg):
             timedelta_range(start="0 days", end="5 days", periods=10, freq="H")
+
+    @pytest.mark.parametrize(
+        "start, end, freq, expected_periods",
+        [
+            ("1D", "10D", "2D", (10 - 1) // 2 + 1),
+            ("2D", "30D", "3D", (30 - 2) // 3 + 1),
+            ("2s", "50s", "5s", (50 - 2) // 5 + 1),
+            # tests that worked before GH 33498:
+            ("4D", "16D", "3D", (16 - 4) // 3 + 1),
+            ("8D", "16D", "40s", (16 * 3600 * 24 - 8 * 3600 * 24) // 40 + 1),
+        ],
+    )
+    def test_timedelta_range_freq_divide_end(self, start, end, freq, expected_periods):
+        # GH 33498 only the cases where `(end % freq) == 0` used to fail
+        res = timedelta_range(start=start, end=end, freq=freq)
+        assert Timedelta(start) == res[0]
+        assert Timedelta(end) >= res[-1]
+        assert len(res) == expected_periods
diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py
@@ -10,7 +10,7 @@
 from pandas.core.groupby.grouper import Grouper
 from pandas.core.indexes.datetimes import date_range
 from pandas.core.indexes.period import PeriodIndex, period_range
-from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range
+from pandas.core.indexes.timedeltas import timedelta_range
 from pandas.core.resample import _asfreq_compat
 
 # a fixture value can be overridden by the test parameter value. Note that the
@@ -182,7 +182,6 @@ def test_resample_size_empty_dataframe(freq, empty_frame_dti):
 @pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0))
 @pytest.mark.parametrize("dtype", [np.float, np.int, np.object, "datetime64[ns]"])
 def test_resample_empty_dtypes(index, dtype, resample_method):
-
     # Empty series were sometimes causing a segfault (for the functions
     # with Cython bounds-checking disabled) or an IndexError.  We just run
     # them to ensure they no longer do.  (GH #10228)
@@ -215,13 +214,7 @@ def test_resample_loffset_arg_type(frame, create_index, arg):
     if isinstance(arg, list):
         expected.columns = pd.MultiIndex.from_tuples([("value", "mean")])
 
-    # GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex
-    if isinstance(expected.index, TimedeltaIndex):
-        msg = "DataFrame are different"
-        with pytest.raises(AssertionError, match=msg):
-            tm.assert_frame_equal(result_agg, expected)
-    else:
-        tm.assert_frame_equal(result_agg, expected)
+    tm.assert_frame_equal(result_agg, expected)
 
 
 @all_ts
diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py
@@ -1,6 +1,7 @@
 from datetime import timedelta
 
 import numpy as np
+import pytest
 
 import pandas as pd
 from pandas import DataFrame, Series
@@ -114,14 +115,39 @@ def test_resample_timedelta_values():
     # check that timedelta dtype is preserved when NaT values are
     # introduced by the resampling
 
-    times = timedelta_range("1 day", "4 day", freq="4D")
+    times = timedelta_range("1 day", "6 day", freq="4D")
     df = DataFrame({"time": times}, index=times)
 
-    times2 = timedelta_range("1 day", "4 day", freq="2D")
+    times2 = timedelta_range("1 day", "6 day", freq="2D")
     exp = Series(times2, index=times2, name="time")
     exp.iloc[1] = pd.NaT
 
     res = df.resample("2D").first()["time"]
     tm.assert_series_equal(res, exp)
     res = df["time"].resample("2D").first()
     tm.assert_series_equal(res, exp)
+
+
+@pytest.mark.parametrize(
+    "start, end, freq, resample_freq",
+    [
+        ("8H", "21h59min50s", "10S", "3H"),  # GH 30353 example
+        ("3H", "22H", "1H", "5H"),
+        ("527D", "5006D", "3D", "10D"),
+        ("1D", "10D", "1D", "2D"),  # GH 13022 example
+        # tests that worked before GH 33498:
+        ("8H", "21h59min50s", "10S", "2H"),
+        ("0H", "21h59min50s", "10S", "3H"),
+        ("10D", "85D", "D", "2D"),
+    ],
+)
+def test_resample_timedelta_edge_case(start, end, freq, resample_freq):
+    # GH 33498
+    # check that the timedelta bins do not contain an extra bin
+    idx = pd.timedelta_range(start=start, end=end, freq=freq)
+    s = pd.Series(np.arange(len(idx)), index=idx)
+    result = s.resample(resample_freq).min()
+    expected_index = pd.timedelta_range(freq=resample_freq, start=start, end=end)
+    tm.assert_index_equal(result.index, expected_index)
+    assert result.index.freq == expected_index.freq
+    assert not np.isnan(result[-1])