Skip to content

BUG: can't resample with non-nano dateindex, out-of-nanosecond-bounds #51274

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Feb 13, 2023
Merged
57 changes: 35 additions & 22 deletions pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -1731,6 +1731,7 @@ def _get_time_bins(self, ax: DatetimeIndex):
ax.min(),
ax.max(),
self.freq,
unit=ax.unit,
closed=self.closed,
origin=self.origin,
offset=self.offset,
Expand All @@ -1750,7 +1751,8 @@ def _get_time_bins(self, ax: DatetimeIndex):
name=ax.name,
ambiguous=True,
nonexistent="shift_forward",
).as_unit(ax.unit)
unit=ax.unit,
)

ax_values = ax.asi8
binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
Expand Down Expand Up @@ -1960,6 +1962,7 @@ def _get_timestamp_range_edges(
first: Timestamp,
last: Timestamp,
freq: BaseOffset,
unit: str,
closed: Literal["right", "left"] = "left",
origin: TimeGrouperOrigin = "start_day",
offset: Timedelta | None = None,
Expand Down Expand Up @@ -2015,7 +2018,7 @@ def _get_timestamp_range_edges(
origin = origin.tz_localize(None)

first, last = _adjust_dates_anchored(
first, last, freq, closed=closed, origin=origin, offset=offset
first, last, freq, closed=closed, origin=origin, offset=offset, unit=unit
)
if isinstance(freq, Day):
first = first.tz_localize(index_tz)
Expand Down Expand Up @@ -2082,7 +2085,7 @@ def _get_period_range_edges(
adjust_last = freq.is_on_offset(last_ts)

first_ts, last_ts = _get_timestamp_range_edges(
first_ts, last_ts, freq, closed=closed, origin=origin, offset=offset
first_ts, last_ts, freq, unit="ns", closed=closed, origin=origin, offset=offset
)

first = (first_ts + int(adjust_first) * freq).to_period(freq)
Expand Down Expand Up @@ -2115,32 +2118,42 @@ def _adjust_dates_anchored(
closed: Literal["right", "left"] = "right",
origin: TimeGrouperOrigin = "start_day",
offset: Timedelta | None = None,
unit="ns",
) -> tuple[Timestamp, Timestamp]:
# First and last offsets should be calculated from the start day to fix an
# error caused by resampling across multiple days when a one-day period is
# not a multiple of the frequency. See GH 8683
# To handle frequencies that are not multiple or divisible by a day we let
# the possibility to define a fixed origin timestamp. See GH 31809
first = first.as_unit("ns")
last = last.as_unit("ns")
first = first.as_unit(unit)
last = last.as_unit(unit)
if offset is not None:
offset = offset.as_unit("ns")

origin_nanos = 0 # origin == "epoch"
offset = offset.as_unit(unit)

# TODO is there anything which can be reused here?
freq_value = freq.nanos
if unit == "us":
freq_value = freq_value // 1_000
elif unit == "ms":
freq_value = freq_value // 1_000_000
elif unit == "s":
freq_value = freq_value // 1_000_000_000
Copy link
Member Author

@MarcoGorelli MarcoGorelli Feb 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel is there any existing function which can be reused here?

there is periods_per_second, but that takes NPY_DATETIMEUNIT rather than str (is there a function to convert between them?)

I didn't find one, but I'll look more carefully later

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

abbrev_to_npy_unit

in tzconversion there are two places where we do approximately this, one of them has a comment to de-duplicate


origin_timestamp = 0 # origin == "epoch"
if origin == "start_day":
origin_nanos = first.normalize()._value
origin_timestamp = first.normalize()._value
elif origin == "start":
origin_nanos = first._value
origin_timestamp = first._value
elif isinstance(origin, Timestamp):
origin_nanos = origin.as_unit("ns")._value
origin_timestamp = origin.as_unit(unit)._value
elif origin in ["end", "end_day"]:
origin_last = last if origin == "end" else last.ceil("D")
sub_freq_times = (origin_last._value - first._value) // freq.nanos
sub_freq_times = (origin_last._value - first._value) // freq_value
if closed == "left":
sub_freq_times += 1
first = origin_last - sub_freq_times * freq
origin_nanos = first._value
origin_nanos += offset._value if offset else 0
origin_timestamp = first._value
origin_timestamp += offset._value if offset else 0

# GH 10117 & GH 19375. If first and last contain timezone information,
# Perform the calculation in UTC in order to avoid localizing on an
Expand All @@ -2152,19 +2165,19 @@ def _adjust_dates_anchored(
if last_tzinfo is not None:
last = last.tz_convert("UTC")

foffset = (first._value - origin_nanos) % freq.nanos
loffset = (last._value - origin_nanos) % freq.nanos
foffset = (first._value - origin_timestamp) % freq_value
loffset = (last._value - origin_timestamp) % freq_value

if closed == "right":
if foffset > 0:
# roll back
fresult_int = first._value - foffset
else:
fresult_int = first._value - freq.nanos
fresult_int = first._value - freq_value

if loffset > 0:
# roll forward
lresult_int = last._value + (freq.nanos - loffset)
lresult_int = last._value + (freq_value - loffset)
else:
# already the end of the road
lresult_int = last._value
Expand All @@ -2177,11 +2190,11 @@ def _adjust_dates_anchored(

if loffset > 0:
# roll forward
lresult_int = last._value + (freq.nanos - loffset)
lresult_int = last._value + (freq_value - loffset)
else:
lresult_int = last._value + freq.nanos
fresult = Timestamp(fresult_int)
lresult = Timestamp(lresult_int)
lresult_int = last._value + freq_value
fresult = Timestamp(fresult_int, unit=unit)
lresult = Timestamp(lresult_int, unit=unit)
if first_tzinfo is not None:
fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo)
if last_tzinfo is not None:
Expand Down
27 changes: 26 additions & 1 deletion pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1838,7 +1838,7 @@ def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last, unit)
exp_last = Timestamp(exp_last)

freq = pd.tseries.frequencies.to_offset(freq)
result = _get_timestamp_range_edges(first, last, freq)
result = _get_timestamp_range_edges(first, last, freq, unit="ns")
expected = (exp_first, exp_last)
assert result == expected

Expand Down Expand Up @@ -1949,3 +1949,28 @@ def test_resample_unsigned_int(any_unsigned_int_numpy_dtype, unit):
),
)
tm.assert_frame_equal(result, expected)


def test_long_rule_non_nano():
    # GH#51024: resampling a second-resolution index whose span falls far
    # outside the nanosecond-representable range must not overflow.
    idx = date_range("0300-01-01", "2000-01-01", unit="s", freq="100Y")
    values = [1, 4, 2, 8, 5, 7, 1, 4, 2, 8, 5, 7, 1, 4, 2, 8, 5]
    ser = Series(values, index=idx)

    result = ser.resample("200Y").mean()

    # Expected bin edges: year-end anchors every 200 years, kept in the
    # original second resolution.
    expected_dates = [
        "0300-12-31",
        "0500-12-31",
        "0700-12-31",
        "0900-12-31",
        "1100-12-31",
        "1300-12-31",
        "1500-12-31",
        "1700-12-31",
        "1900-12-31",
    ]
    expected_idx = DatetimeIndex(
        np.array(expected_dates).astype("datetime64[s]"),
        freq="200A-DEC",
    )
    expected_values = [1.0, 3.0, 6.5, 4.0, 3.0, 6.5, 4.0, 3.0, 6.5]
    expected = Series(expected_values, index=expected_idx)
    tm.assert_series_equal(result, expected)