Skip to content

Commit 520e758

Browse files
ms7463Pingviinituutti
authored and committed
BUG/ENH - base argument no longer ignored in period resample (pandas-dev#23941)
1 parent 56f15d0 commit 520e758

File tree

4 files changed

+172
-6
lines changed

4 files changed

+172
-6
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,7 @@ Other Enhancements
353353
- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`)
354354
- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`).
355355
- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`).
356+
- :meth:`DataFrame.resample` and :meth:`Series.resample` with a :class:`PeriodIndex` will now respect the ``base`` argument in the same fashion as with a :class:`DatetimeIndex`. (:issue:`23882`)
356357
- :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
357358
all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`)
358359
- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`).

pandas/core/resample.py

+92-5
Original file line numberDiff line numberDiff line change
@@ -1389,9 +1389,10 @@ def _get_time_bins(self, ax):
13891389
data=[], freq=self.freq, name=ax.name)
13901390
return binner, [], labels
13911391

1392-
first, last = _get_range_edges(ax.min(), ax.max(), self.freq,
1393-
closed=self.closed,
1394-
base=self.base)
1392+
first, last = _get_timestamp_range_edges(ax.min(), ax.max(),
1393+
self.freq,
1394+
closed=self.closed,
1395+
base=self.base)
13951396
tz = ax.tz
13961397
# GH #12037
13971398
# use first/last directly instead of call replace() on them
@@ -1540,20 +1541,39 @@ def _get_period_bins(self, ax):
15401541
data=[], freq=self.freq, name=ax.name)
15411542
return binner, [], labels
15421543

1544+
freq_mult = self.freq.n
1545+
15431546
start = ax.min().asfreq(self.freq, how=self.convention)
15441547
end = ax.max().asfreq(self.freq, how='end')
1548+
bin_shift = 0
1549+
1550+
# GH 23882
1551+
if self.base:
1552+
# get base adjusted bin edge labels
1553+
p_start, end = _get_period_range_edges(start,
1554+
end,
1555+
self.freq,
1556+
closed=self.closed,
1557+
base=self.base)
1558+
1559+
# Get offset for bin edge (not label edge) adjustment
1560+
start_offset = (pd.Period(start, self.freq)
1561+
- pd.Period(p_start, self.freq))
1562+
bin_shift = start_offset.n % freq_mult
1563+
start = p_start
15451564

15461565
labels = binner = PeriodIndex(start=start, end=end,
15471566
freq=self.freq, name=ax.name)
15481567

15491568
i8 = memb.asi8
1550-
freq_mult = self.freq.n
15511569

15521570
# when upsampling to subperiods, we need to generate enough bins
15531571
expected_bins_count = len(binner) * freq_mult
15541572
i8_extend = expected_bins_count - (i8[-1] - i8[0])
15551573
rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult)
15561574
rng += freq_mult
1575+
# adjust bin edge indexes to account for base
1576+
rng -= bin_shift
15571577
bins = memb.searchsorted(rng, side='left')
15581578

15591579
if nat_count > 0:
@@ -1582,7 +1602,35 @@ def _take_new_index(obj, indexer, new_index, axis=0):
15821602
raise ValueError("'obj' should be either a Series or a DataFrame")
15831603

15841604

1585-
def _get_range_edges(first, last, offset, closed='left', base=0):
1605+
def _get_timestamp_range_edges(first, last, offset, closed='left', base=0):
1606+
"""
1607+
Adjust the `first` Timestamp to the preceding Timestamp that resides on
1608+
the provided offset. Adjust the `last` Timestamp to the following
1609+
Timestamp that resides on the provided offset. Input Timestamps that
1610+
already reside on the offset will be adjusted depending on the type of
1611+
offset and the `closed` parameter.
1612+
1613+
Parameters
1614+
----------
1615+
first : pd.Timestamp
1616+
The beginning Timestamp of the range to be adjusted.
1617+
last : pd.Timestamp
1618+
The ending Timestamp of the range to be adjusted.
1619+
offset : pd.DateOffset
1620+
The dateoffset to which the Timestamps will be adjusted.
1621+
closed : {'right', 'left'}, default 'left'
1622+
Which side of bin interval is closed.
1623+
base : int, default 0
1624+
The "origin" of the adjusted Timestamps.
1625+
1626+
Returns
1627+
-------
1628+
A tuple of length 2, containing the adjusted pd.Timestamp objects.
1629+
"""
1630+
if not all(isinstance(obj, pd.Timestamp) for obj in [first, last]):
1631+
raise TypeError("'first' and 'last' must be instances of type "
1632+
"Timestamp")
1633+
15861634
if isinstance(offset, Tick):
15871635
is_day = isinstance(offset, Day)
15881636
day_nanos = delta_to_nanoseconds(timedelta(1))
@@ -1606,6 +1654,45 @@ def _get_range_edges(first, last, offset, closed='left', base=0):
16061654
return first, last
16071655

16081656

1657+
def _get_period_range_edges(first, last, offset, closed='left', base=0):
1658+
"""
1659+
Adjust the provided `first` and `last` Periods to the respective Period of
1660+
the given offset that encompasses them.
1661+
1662+
Parameters
1663+
----------
1664+
first : pd.Period
1665+
The beginning Period of the range to be adjusted.
1666+
last : pd.Period
1667+
The ending Period of the range to be adjusted.
1668+
offset : pd.DateOffset
1669+
The dateoffset to which the Periods will be adjusted.
1670+
closed : {'right', 'left'}, default 'left'
1671+
Which side of bin interval is closed.
1672+
base : int, default 0
1673+
The "origin" of the adjusted Periods.
1674+
1675+
Returns
1676+
-------
1677+
A tuple of length 2, containing the adjusted pd.Period objects.
1678+
"""
1679+
if not all(isinstance(obj, pd.Period) for obj in [first, last]):
1680+
raise TypeError("'first' and 'last' must be instances of type Period")
1681+
1682+
# GH 23882
1683+
first = first.to_timestamp()
1684+
last = last.to_timestamp()
1685+
adjust_first = not offset.onOffset(first)
1686+
adjust_last = offset.onOffset(last)
1687+
1688+
first, last = _get_timestamp_range_edges(first, last, offset,
1689+
closed=closed, base=base)
1690+
1691+
first = (first + adjust_first * offset).to_period(offset)
1692+
last = (last - adjust_last * offset).to_period(offset)
1693+
return first, last
1694+
1695+
16091696
def _adjust_dates_anchored(first, last, offset, closed='right', base=0):
16101697
# First and last offsets should be calculated from the start day to fix an
16111698
# error caused by resampling across multiple days when a one day period is

pandas/tests/resample/test_datetime_index.py

+26-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
from pandas.core.indexes.datetimes import date_range
1616
from pandas.core.indexes.period import Period, period_range
1717
from pandas.core.indexes.timedeltas import timedelta_range
18-
from pandas.core.resample import DatetimeIndex, TimeGrouper
18+
from pandas.core.resample import (
19+
DatetimeIndex, TimeGrouper, _get_timestamp_range_edges)
1920
import pandas.util.testing as tm
2021
from pandas.util.testing import (
2122
assert_almost_equal, assert_frame_equal, assert_series_equal)
@@ -1481,3 +1482,27 @@ def test_resample_equivalent_offsets(self, n1, freq1, n2, freq2, k):
14811482
result1 = s.resample(str(n1_) + freq1).mean()
14821483
result2 = s.resample(str(n2_) + freq2).mean()
14831484
assert_series_equal(result1, result2)
1485+
1486+
@pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [
1487+
('19910905', '19920406', 'D', '19910905', '19920407'),
1488+
('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920407'),
1489+
('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00',
1490+
'19920406 07:00'),
1491+
('19910906', '19920406', 'M', '19910831', '19920430'),
1492+
('19910831', '19920430', 'M', '19910831', '19920531'),
1493+
('1991-08', '1992-04', 'M', '19910831', '19920531'),
1494+
])
1495+
def test_get_timestamp_range_edges(self, first, last, offset,
1496+
exp_first, exp_last):
1497+
first = pd.Period(first)
1498+
first = first.to_timestamp(first.freq)
1499+
last = pd.Period(last)
1500+
last = last.to_timestamp(last.freq)
1501+
1502+
exp_first = pd.Timestamp(exp_first, freq=offset)
1503+
exp_last = pd.Timestamp(exp_last, freq=offset)
1504+
1505+
offset = pd.tseries.frequencies.to_offset(offset)
1506+
result = _get_timestamp_range_edges(first, last, offset)
1507+
expected = (exp_first, exp_last)
1508+
assert result == expected

pandas/tests/resample/test_period_index.py

+53
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from pandas import DataFrame, Series, Timestamp
1414
from pandas.core.indexes.datetimes import date_range
1515
from pandas.core.indexes.period import Period, PeriodIndex, period_range
16+
from pandas.core.resample import _get_period_range_edges
1617
import pandas.util.testing as tm
1718
from pandas.util.testing import (
1819
assert_almost_equal, assert_frame_equal, assert_series_equal)
@@ -701,3 +702,55 @@ def test_resample_with_only_nat(self):
701702
expected = DataFrame([], index=expected_index)
702703
result = frame.resample('1s').mean()
703704
assert_frame_equal(result, expected)
705+
706+
@pytest.mark.parametrize('start,end,start_freq,end_freq,base', [
707+
('19910905', '19910909 03:00', 'H', '24H', 10),
708+
('19910905', '19910909 12:00', 'H', '24H', 10),
709+
('19910905', '19910909 23:00', 'H', '24H', 10),
710+
('19910905 10:00', '19910909', 'H', '24H', 10),
711+
('19910905 10:00', '19910909 10:00', 'H', '24H', 10),
712+
('19910905', '19910909 10:00', 'H', '24H', 10),
713+
('19910905 12:00', '19910909', 'H', '24H', 10),
714+
('19910905 12:00', '19910909 03:00', 'H', '24H', 10),
715+
('19910905 12:00', '19910909 12:00', 'H', '24H', 10),
716+
('19910905 12:00', '19910909 12:00', 'H', '24H', 34),
717+
('19910905 12:00', '19910909 12:00', 'H', '17H', 10),
718+
('19910905 12:00', '19910909 12:00', 'H', '17H', 3),
719+
('19910905 12:00', '19910909 1:00', 'H', 'M', 3),
720+
('19910905', '19910913 06:00', '2H', '24H', 10),
721+
('19910905', '19910905 01:39', 'Min', '5Min', 3),
722+
('19910905', '19910905 03:18', '2Min', '5Min', 3),
723+
])
724+
def test_resample_with_non_zero_base(self, start, end, start_freq,
725+
end_freq, base):
726+
# GH 23882
727+
s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq))
728+
s = s + np.arange(len(s))
729+
result = s.resample(end_freq, base=base).mean()
730+
result = result.to_timestamp(end_freq)
731+
# to_timestamp casts 24H -> D
732+
result = result.asfreq(end_freq) if end_freq == '24H' else result
733+
expected = s.to_timestamp().resample(end_freq, base=base).mean()
734+
assert_series_equal(result, expected)
735+
736+
@pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [
737+
('19910905', '19920406', 'D', '19910905', '19920406'),
738+
('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920406'),
739+
('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00',
740+
'19920406 06:00'),
741+
('19910906', '19920406', 'M', '1991-09', '1992-04'),
742+
('19910831', '19920430', 'M', '1991-08', '1992-04'),
743+
('1991-08', '1992-04', 'M', '1991-08', '1992-04'),
744+
])
745+
def test_get_period_range_edges(self, first, last, offset,
746+
exp_first, exp_last):
747+
first = pd.Period(first)
748+
last = pd.Period(last)
749+
750+
exp_first = pd.Period(exp_first, freq=offset)
751+
exp_last = pd.Period(exp_last, freq=offset)
752+
753+
offset = pd.tseries.frequencies.to_offset(offset)
754+
result = _get_period_range_edges(first, last, offset)
755+
expected = (exp_first, exp_last)
756+
assert result == expected

0 commit comments

Comments
 (0)