From e6a6064059ba498e72aa2be090f6e7fdcb6b2e55 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 9 Feb 2023 17:38:33 +0000 Subject: [PATCH 1/4] fix resample non-nano out-of-nano-bounds --- pandas/core/resample.py | 57 ++++++++++++-------- pandas/tests/resample/test_datetime_index.py | 27 +++++++++- 2 files changed, 61 insertions(+), 23 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 20495a50818d8..b591b12a7ba3a 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1731,6 +1731,7 @@ def _get_time_bins(self, ax: DatetimeIndex): ax.min(), ax.max(), self.freq, + unit=ax.unit, closed=self.closed, origin=self.origin, offset=self.offset, @@ -1750,7 +1751,8 @@ def _get_time_bins(self, ax: DatetimeIndex): name=ax.name, ambiguous=True, nonexistent="shift_forward", - ).as_unit(ax.unit) + unit=ax.unit, + ) ax_values = ax.asi8 binner, bin_edges = self._adjust_bin_edges(binner, ax_values) @@ -1960,6 +1962,7 @@ def _get_timestamp_range_edges( first: Timestamp, last: Timestamp, freq: BaseOffset, + unit: str, closed: Literal["right", "left"] = "left", origin: TimeGrouperOrigin = "start_day", offset: Timedelta | None = None, @@ -2015,7 +2018,7 @@ def _get_timestamp_range_edges( origin = origin.tz_localize(None) first, last = _adjust_dates_anchored( - first, last, freq, closed=closed, origin=origin, offset=offset + first, last, freq, closed=closed, origin=origin, offset=offset, unit=unit ) if isinstance(freq, Day): first = first.tz_localize(index_tz) @@ -2082,7 +2085,7 @@ def _get_period_range_edges( adjust_last = freq.is_on_offset(last_ts) first_ts, last_ts = _get_timestamp_range_edges( - first_ts, last_ts, freq, closed=closed, origin=origin, offset=offset + first_ts, last_ts, freq, unit="ns", closed=closed, origin=origin, offset=offset ) first = (first_ts + int(adjust_first) * freq).to_period(freq) @@ -2115,32 +2118,42 @@ def _adjust_dates_anchored( closed: Literal["right", "left"] = "right", origin: TimeGrouperOrigin = "start_day", offset: Timedelta | None = None, + unit="ns", ) -> tuple[Timestamp, Timestamp]: # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is # not a multiple of the frequency. See GH 8683 # To handle frequencies that are not multiple or divisible by a day we let # the possibility to define a fixed origin timestamp. See GH 31809 - first = first.as_unit("ns") - last = last.as_unit("ns") + first = first.as_unit(unit) + last = last.as_unit(unit) if offset is not None: - offset = offset.as_unit("ns") - - origin_nanos = 0 # origin == "epoch" + offset = offset.as_unit(unit) + + # TODO is there anything which can be reused here? + freq_value = freq.nanos + if unit == "us": + freq_value = freq_value // 1_000 + elif unit == "ms": + freq_value = freq_value // 1_000_000 + elif unit == "s": + freq_value = freq_value // 1_000_000_000 + + origin_timestamp = 0 # origin == "epoch" if origin == "start_day": - origin_nanos = first.normalize()._value + origin_timestamp = first.normalize()._value elif origin == "start": - origin_nanos = first._value + origin_timestamp = first._value elif isinstance(origin, Timestamp): - origin_nanos = origin.as_unit("ns")._value + origin_timestamp = origin.as_unit(unit)._value elif origin in ["end", "end_day"]: origin_last = last if origin == "end" else last.ceil("D") - sub_freq_times = (origin_last._value - first._value) // freq.nanos + sub_freq_times = (origin_last._value - first._value) // freq_value if closed == "left": sub_freq_times += 1 first = origin_last - sub_freq_times * freq - origin_nanos = first._value - origin_nanos += offset._value if offset else 0 + origin_timestamp = first._value + origin_timestamp += offset._value if offset else 0 # GH 10117 & GH 19375. If first and last contain timezone information, # Perform the calculation in UTC in order to avoid localizing on an @@ -2152,19 +2165,19 @@ def _adjust_dates_anchored( if last_tzinfo is not None: last = last.tz_convert("UTC") - foffset = (first._value - origin_nanos) % freq.nanos - loffset = (last._value - origin_nanos) % freq.nanos + foffset = (first._value - origin_timestamp) % freq_value + loffset = (last._value - origin_timestamp) % freq_value if closed == "right": if foffset > 0: # roll back fresult_int = first._value - foffset else: - fresult_int = first._value - freq.nanos + fresult_int = first._value - freq_value if loffset > 0: # roll forward - lresult_int = last._value + (freq.nanos - loffset) + lresult_int = last._value + (freq_value - loffset) else: # already the end of the road lresult_int = last._value @@ -2177,11 +2190,11 @@ def _adjust_dates_anchored( if loffset > 0: # roll forward - lresult_int = last._value + (freq.nanos - loffset) + lresult_int = last._value + (freq_value - loffset) else: - lresult_int = last._value + freq.nanos - fresult = Timestamp(fresult_int) - lresult = Timestamp(lresult_int) + lresult_int = last._value + freq_value + fresult = Timestamp(fresult_int, unit=unit) + lresult = Timestamp(lresult_int, unit=unit) if first_tzinfo is not None: fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo) if last_tzinfo is not None: diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index d18db6ab5f643..13041a81dadcf 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1838,7 +1838,7 @@ def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last, unit) exp_last = Timestamp(exp_last) freq = pd.tseries.frequencies.to_offset(freq) - result = _get_timestamp_range_edges(first, last, freq) + result = _get_timestamp_range_edges(first, last, freq, unit="ns") expected = (exp_first, exp_last) assert result == expected @@ -1949,3 +1949,28 @@ def test_resample_unsigned_int(any_unsigned_int_numpy_dtype, unit): ), ) tm.assert_frame_equal(result, expected) + + +def test_long_rule_non_nano(): + # https://github.com/pandas-dev/pandas/issues/51024 + idx = date_range("0300-01-01", "2000-01-01", unit="s", freq="100Y") + ser = Series([1, 4, 2, 8, 5, 7, 1, 4, 2, 8, 5, 7, 1, 4, 2, 8, 5], index=idx) + result = ser.resample("200Y").mean() + expected_idx = DatetimeIndex( + np.array( + [ + "0300-12-31", + "0500-12-31", + "0700-12-31", + "0900-12-31", + "1100-12-31", + "1300-12-31", + "1500-12-31", + "1700-12-31", + "1900-12-31", + ] + ).astype("datetime64[s]"), + freq="200A-DEC", + ) + expected = Series([1.0, 3.0, 6.5, 4.0, 3.0, 6.5, 4.0, 3.0, 6.5], index=expected_idx) + tm.assert_series_equal(result, expected) From d7f45ca25a0bfc593a8745ec4152b7ea53547961 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 9 Feb 2023 18:03:43 +0000 Subject: [PATCH 2/4] use periods_per_second and abbrev_to_npy_unit --- pandas/core/resample.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index b591b12a7ba3a..90f2d2e63d6b2 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -23,8 +23,10 @@ Period, Timedelta, Timestamp, + periods_per_second, to_offset, ) +from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit from pandas._typing import ( AnyArrayLike, Axis, @@ -2130,14 +2132,9 @@ def _adjust_dates_anchored( if offset is not None: offset = offset.as_unit(unit) - # TODO is there anything which can be reused here? - freq_value = freq.nanos - if unit == "us": - freq_value = freq_value // 1_000 - elif unit == "ms": - freq_value = freq_value // 1_000_000 - elif unit == "s": - freq_value = freq_value // 1_000_000_000 + freq_value = freq.nanos // ( + 1_000_000_000 // periods_per_second(abbrev_to_npy_unit(unit)) + ) origin_timestamp = 0 # origin == "epoch" if origin == "start_day": From a4008875ab2a77a327b1d1415cdd291e6b5fc74f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 10 Feb 2023 08:54:43 +0000 Subject: [PATCH 3/4] autotyping --- pandas/core/resample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 90f2d2e63d6b2..3405d5181b405 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2120,7 +2120,7 @@ def _adjust_dates_anchored( closed: Literal["right", "left"] = "right", origin: TimeGrouperOrigin = "start_day", offset: Timedelta | None = None, - unit="ns", + unit: str = "ns", ) -> tuple[Timestamp, Timestamp]: # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is From 7e5b09661605277e3eb54660cffd55f9563611f2 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 12 Feb 2023 21:26:16 +0000 Subject: [PATCH 4/4] require without freq.nanos --- pandas/core/resample.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index ebd12225e199a..96982f8727188 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -23,10 +23,8 @@ Period, Timedelta, Timestamp, - periods_per_second, to_offset, ) -from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit from pandas._typing import ( AnyArrayLike, Axis, @@ -2132,9 +2130,7 @@ def _adjust_dates_anchored( if offset is not None: offset = offset.as_unit(unit) - freq_value = freq.nanos // ( - 1_000_000_000 // periods_per_second(abbrev_to_npy_unit(unit)) - ) + freq_value = Timedelta(freq).as_unit(unit)._value origin_timestamp = 0 # origin == "epoch" if origin == "start_day":