Skip to content

BUG: can't resample with non-nano dateindex, out-of-nanosecond-bounds #51274

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Feb 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 27 additions & 21 deletions pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -1731,6 +1731,7 @@ def _get_time_bins(self, ax: DatetimeIndex):
ax.min(),
ax.max(),
self.freq,
unit=ax.unit,
closed=self.closed,
origin=self.origin,
offset=self.offset,
Expand All @@ -1750,7 +1751,8 @@ def _get_time_bins(self, ax: DatetimeIndex):
name=ax.name,
ambiguous=True,
nonexistent="shift_forward",
).as_unit(ax.unit)
unit=ax.unit,
)

ax_values = ax.asi8
binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
Expand Down Expand Up @@ -1960,6 +1962,7 @@ def _get_timestamp_range_edges(
first: Timestamp,
last: Timestamp,
freq: BaseOffset,
unit: str,
closed: Literal["right", "left"] = "left",
origin: TimeGrouperOrigin = "start_day",
offset: Timedelta | None = None,
Expand Down Expand Up @@ -2015,7 +2018,7 @@ def _get_timestamp_range_edges(
origin = origin.tz_localize(None)

first, last = _adjust_dates_anchored(
first, last, freq, closed=closed, origin=origin, offset=offset
first, last, freq, closed=closed, origin=origin, offset=offset, unit=unit
)
if isinstance(freq, Day):
first = first.tz_localize(index_tz)
Expand Down Expand Up @@ -2082,7 +2085,7 @@ def _get_period_range_edges(
adjust_last = freq.is_on_offset(last_ts)

first_ts, last_ts = _get_timestamp_range_edges(
first_ts, last_ts, freq, closed=closed, origin=origin, offset=offset
first_ts, last_ts, freq, unit="ns", closed=closed, origin=origin, offset=offset
)

first = (first_ts + int(adjust_first) * freq).to_period(freq)
Expand Down Expand Up @@ -2115,32 +2118,35 @@ def _adjust_dates_anchored(
closed: Literal["right", "left"] = "right",
origin: TimeGrouperOrigin = "start_day",
offset: Timedelta | None = None,
unit: str = "ns",
) -> tuple[Timestamp, Timestamp]:
# First and last offsets should be calculated from the start day to fix an
# error caused by resampling across multiple days when a one day period is
# not a multiple of the frequency. See GH 8683
# To handle frequencies that are not multiple or divisible by a day we let
# the possibility to define a fixed origin timestamp. See GH 31809
first = first.as_unit("ns")
last = last.as_unit("ns")
first = first.as_unit(unit)
last = last.as_unit(unit)
if offset is not None:
offset = offset.as_unit("ns")
offset = offset.as_unit(unit)

freq_value = Timedelta(freq).as_unit(unit)._value

origin_nanos = 0 # origin == "epoch"
origin_timestamp = 0 # origin == "epoch"
if origin == "start_day":
origin_nanos = first.normalize()._value
origin_timestamp = first.normalize()._value
elif origin == "start":
origin_nanos = first._value
origin_timestamp = first._value
elif isinstance(origin, Timestamp):
origin_nanos = origin.as_unit("ns")._value
origin_timestamp = origin.as_unit(unit)._value
elif origin in ["end", "end_day"]:
origin_last = last if origin == "end" else last.ceil("D")
sub_freq_times = (origin_last._value - first._value) // freq.nanos
sub_freq_times = (origin_last._value - first._value) // freq_value
if closed == "left":
sub_freq_times += 1
first = origin_last - sub_freq_times * freq
origin_nanos = first._value
origin_nanos += offset._value if offset else 0
origin_timestamp = first._value
origin_timestamp += offset._value if offset else 0

# GH 10117 & GH 19375. If first and last contain timezone information,
# Perform the calculation in UTC in order to avoid localizing on an
Expand All @@ -2152,19 +2158,19 @@ def _adjust_dates_anchored(
if last_tzinfo is not None:
last = last.tz_convert("UTC")

foffset = (first._value - origin_nanos) % freq.nanos
loffset = (last._value - origin_nanos) % freq.nanos
foffset = (first._value - origin_timestamp) % freq_value
loffset = (last._value - origin_timestamp) % freq_value

if closed == "right":
if foffset > 0:
# roll back
fresult_int = first._value - foffset
else:
fresult_int = first._value - freq.nanos
fresult_int = first._value - freq_value

if loffset > 0:
# roll forward
lresult_int = last._value + (freq.nanos - loffset)
lresult_int = last._value + (freq_value - loffset)
else:
# already the end of the road
lresult_int = last._value
Expand All @@ -2177,11 +2183,11 @@ def _adjust_dates_anchored(

if loffset > 0:
# roll forward
lresult_int = last._value + (freq.nanos - loffset)
lresult_int = last._value + (freq_value - loffset)
else:
lresult_int = last._value + freq.nanos
fresult = Timestamp(fresult_int)
lresult = Timestamp(lresult_int)
lresult_int = last._value + freq_value
fresult = Timestamp(fresult_int, unit=unit)
lresult = Timestamp(lresult_int, unit=unit)
if first_tzinfo is not None:
fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo)
if last_tzinfo is not None:
Expand Down
27 changes: 26 additions & 1 deletion pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1838,7 +1838,7 @@ def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last, unit)
exp_last = Timestamp(exp_last)

freq = pd.tseries.frequencies.to_offset(freq)
result = _get_timestamp_range_edges(first, last, freq)
result = _get_timestamp_range_edges(first, last, freq, unit="ns")
expected = (exp_first, exp_last)
assert result == expected

Expand Down Expand Up @@ -1949,3 +1949,28 @@ def test_resample_unsigned_int(any_unsigned_int_numpy_dtype, unit):
),
)
tm.assert_frame_equal(result, expected)


def test_long_rule_non_nano():
    # https://github.com/pandas-dev/pandas/issues/51024
    # Regression test: resampling a DatetimeIndex stored in a non-nanosecond
    # unit ("s" here) whose dates fall outside the datetime64[ns]-representable
    # range (year 0300 is before the ns lower bound of 1677) previously failed.
    # The resample must be carried out in the index's own resolution.
    idx = date_range("0300-01-01", "2000-01-01", unit="s", freq="100Y")
    ser = Series([1, 4, 2, 8, 5, 7, 1, 4, 2, 8, 5, 7, 1, 4, 2, 8, 5], index=idx)
    result = ser.resample("200Y").mean()
    # The expected bin labels are 200-year anchored year-ends, and the result
    # index must preserve the seconds ("datetime64[s]") resolution of the input.
    expected_idx = DatetimeIndex(
        np.array(
            [
                "0300-12-31",
                "0500-12-31",
                "0700-12-31",
                "0900-12-31",
                "1100-12-31",
                "1300-12-31",
                "1500-12-31",
                "1700-12-31",
                "1900-12-31",
            ]
        ).astype("datetime64[s]"),
        freq="200A-DEC",
    )
    # Expected means of the repeating input values grouped into 200Y bins.
    expected = Series([1.0, 3.0, 6.5, 4.0, 3.0, 6.5, 4.0, 3.0, 6.5], index=expected_idx)
    tm.assert_series_equal(result, expected)