Skip to content

Commit 7d4a260

Browse files
winklerand and jreback
authored and committed
BUG: Fix some PeriodIndex resampling issues (#16153)
closes #15944 xref #12884 closes #13083 closes #13224
1 parent fd336fb commit 7d4a260

File tree

3 files changed

+340
-172
lines changed

3 files changed

+340
-172
lines changed

doc/source/whatsnew/v0.21.0.txt

+76
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,82 @@ Other Enhancements
171171
Backwards incompatible API changes
172172
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
173173

174+
.. _whatsnew_0210.api_breaking.period_index_resampling:
175+
176+
``PeriodIndex`` resampling
177+
^^^^^^^^^^^^^^^^^^^^^^^^^^
178+
179+
In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`).
180+
181+
Previous Behavior:
182+
183+
.. code-block:: ipython
184+
185+
In [1]: pi = pd.period_range('2017-01', periods=12, freq='M')
186+
187+
In [2]: s = pd.Series(np.arange(12), index=pi)
188+
189+
In [3]: resampled = s.resample('2Q').mean()
190+
191+
In [4]: resampled
192+
Out[4]:
193+
2017-03-31 1.0
194+
2017-09-30 5.5
195+
2018-03-31 10.0
196+
Freq: 2Q-DEC, dtype: float64
197+
198+
In [5]: resampled.index
199+
Out[5]: DatetimeIndex(['2017-03-31', '2017-09-30', '2018-03-31'], dtype='datetime64[ns]', freq='2Q-DEC')
200+
201+
New Behavior:
202+
203+
.. ipython:: python
204+
205+
pi = pd.period_range('2017-01', periods=12, freq='M')
206+
207+
s = pd.Series(np.arange(12), index=pi)
208+
209+
resampled = s.resample('2Q').mean()
210+
211+
resampled
212+
213+
resampled.index
214+
215+
216+
Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a ``DataFrame`` with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior.
217+
218+
Previous Behavior:
219+
220+
.. code-block:: ipython
221+
222+
In [1]: pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10)
223+
224+
In [2]: s = pd.Series(np.arange(10), index=pi)
225+
226+
In [3]: s.resample('H').ohlc()
227+
Out[3]:
228+
2000-01-01 00:00 0.0
229+
...
230+
2000-01-10 23:00 NaN
231+
Freq: H, Length: 240, dtype: float64
232+
233+
In [4]: s.resample('M').ohlc()
234+
Out[4]:
235+
open high low close
236+
2000-01 0 9 0 9
237+
238+
New Behavior:
239+
240+
.. ipython:: python
241+
242+
pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10)
243+
244+
s = pd.Series(np.arange(10), index=pi)
245+
246+
s.resample('H').ohlc()
247+
248+
s.resample('M').ohlc()
249+
174250

175251
.. _whatsnew_0210.api_breaking.deps:
176252

pandas/core/resample.py

+74-58
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from pandas.core.indexes.datetimes import DatetimeIndex, date_range
1515
from pandas.core.indexes.timedeltas import TimedeltaIndex
1616
from pandas.tseries.offsets import DateOffset, Tick, Day, _delta_to_nanoseconds
17-
from pandas.core.indexes.period import PeriodIndex, period_range
17+
from pandas.core.indexes.period import PeriodIndex
1818
import pandas.core.common as com
1919
import pandas.core.algorithms as algos
2020
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
@@ -834,53 +834,32 @@ class PeriodIndexResampler(DatetimeIndexResampler):
834834
def _resampler_for_grouping(self):
835835
return PeriodIndexResamplerGroupby
836836

837+
def _get_binner_for_time(self):
838+
if self.kind == 'timestamp':
839+
return super(PeriodIndexResampler, self)._get_binner_for_time()
840+
return self.groupby._get_period_bins(self.ax)
841+
837842
def _convert_obj(self, obj):
838843
obj = super(PeriodIndexResampler, self)._convert_obj(obj)
839844

840-
offset = to_offset(self.freq)
841-
if offset.n > 1:
842-
if self.kind == 'period': # pragma: no cover
843-
print('Warning: multiple of frequency -> timestamps')
844-
845-
# Cannot have multiple of periods, convert to timestamp
845+
if self._from_selection:
846+
# see GH 14008, GH 12871
847+
msg = ("Resampling from level= or on= selection"
848+
" with a PeriodIndex is not currently supported,"
849+
" use .set_index(...) to explicitly set index")
850+
raise NotImplementedError(msg)
851+
852+
if self.loffset is not None:
853+
# Cannot apply loffset/timedelta to PeriodIndex -> convert to
854+
# timestamps
846855
self.kind = 'timestamp'
847856

848857
# convert to timestamp
849-
if not (self.kind is None or self.kind == 'period'):
850-
if self._from_selection:
851-
# see GH 14008, GH 12871
852-
msg = ("Resampling from level= or on= selection"
853-
" with a PeriodIndex is not currently supported,"
854-
" use .set_index(...) to explicitly set index")
855-
raise NotImplementedError(msg)
856-
else:
857-
obj = obj.to_timestamp(how=self.convention)
858+
if self.kind == 'timestamp':
859+
obj = obj.to_timestamp(how=self.convention)
858860

859861
return obj
860862

861-
def aggregate(self, arg, *args, **kwargs):
862-
result, how = self._aggregate(arg, *args, **kwargs)
863-
if result is None:
864-
result = self._downsample(arg, *args, **kwargs)
865-
866-
result = self._apply_loffset(result)
867-
return result
868-
869-
agg = aggregate
870-
871-
def _get_new_index(self):
872-
""" return our new index """
873-
ax = self.ax
874-
875-
if len(ax) == 0:
876-
values = []
877-
else:
878-
start = ax[0].asfreq(self.freq, how=self.convention)
879-
end = ax[-1].asfreq(self.freq, how='end')
880-
values = period_range(start, end, freq=self.freq).asi8
881-
882-
return ax._shallow_copy(values, freq=self.freq)
883-
884863
def _downsample(self, how, **kwargs):
885864
"""
886865
Downsample the cython defined function
@@ -898,22 +877,17 @@ def _downsample(self, how, **kwargs):
898877
how = self._is_cython_func(how) or how
899878
ax = self.ax
900879

901-
new_index = self._get_new_index()
902-
903-
# Start vs. end of period
904-
memb = ax.asfreq(self.freq, how=self.convention)
905-
906880
if is_subperiod(ax.freq, self.freq):
907881
# Downsampling
908-
if len(new_index) == 0:
909-
bins = []
910-
else:
911-
i8 = memb.asi8
912-
rng = np.arange(i8[0], i8[-1] + 1)
913-
bins = memb.searchsorted(rng, side='right')
914-
grouper = BinGrouper(bins, new_index)
915-
return self._groupby_and_aggregate(how, grouper=grouper)
882+
return self._groupby_and_aggregate(how, grouper=self.grouper)
916883
elif is_superperiod(ax.freq, self.freq):
884+
if how == 'ohlc':
885+
# GH #13083
886+
# upsampling to subperiods is handled as an asfreq, which works
887+
# for pure aggregating/reducing methods
888+
# OHLC reduces along the time dimension, but creates multiple
889+
# values for each period -> handle by _groupby_and_aggregate()
890+
return self._groupby_and_aggregate(how, grouper=self.grouper)
917891
return self.asfreq()
918892
elif ax.freq == self.freq:
919893
return self.asfreq()
@@ -936,19 +910,16 @@ def _upsample(self, method, limit=None, fill_value=None):
936910
.fillna
937911
938912
"""
939-
if self._from_selection:
940-
raise ValueError("Upsampling from level= or on= selection"
941-
" is not supported, use .set_index(...)"
942-
" to explicitly set index to"
943-
" datetime-like")
913+
944914
# we may need to actually resample as if we are timestamps
945915
if self.kind == 'timestamp':
946916
return super(PeriodIndexResampler, self)._upsample(
947917
method, limit=limit, fill_value=fill_value)
948918

919+
self._set_binner()
949920
ax = self.ax
950921
obj = self.obj
951-
new_index = self._get_new_index()
922+
new_index = self.binner
952923

953924
# Start vs. end of period
954925
memb = ax.asfreq(self.freq, how=self.convention)
@@ -1293,6 +1264,51 @@ def _get_time_period_bins(self, ax):
12931264

12941265
return binner, bins, labels
12951266

1267+
def _get_period_bins(self, ax):
1268+
if not isinstance(ax, PeriodIndex):
1269+
raise TypeError('axis must be a PeriodIndex, but got '
1270+
'an instance of %r' % type(ax).__name__)
1271+
1272+
memb = ax.asfreq(self.freq, how=self.convention)
1273+
1274+
# NaT handling as in pandas._libs.lib.generate_bins_dt64()
1275+
nat_count = 0
1276+
if memb.hasnans:
1277+
nat_count = np.sum(memb._isnan)
1278+
memb = memb[~memb._isnan]
1279+
1280+
# if index contains no valid (non-NaT) values, return empty index
1281+
if not len(memb):
1282+
binner = labels = PeriodIndex(
1283+
data=[], freq=self.freq, name=ax.name)
1284+
return binner, [], labels
1285+
1286+
start = ax.min().asfreq(self.freq, how=self.convention)
1287+
end = ax.max().asfreq(self.freq, how='end')
1288+
1289+
labels = binner = PeriodIndex(start=start, end=end,
1290+
freq=self.freq, name=ax.name)
1291+
1292+
i8 = memb.asi8
1293+
freq_mult = self.freq.n
1294+
1295+
# when upsampling to subperiods, we need to generate enough bins
1296+
expected_bins_count = len(binner) * freq_mult
1297+
i8_extend = expected_bins_count - (i8[-1] - i8[0])
1298+
rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult)
1299+
rng += freq_mult
1300+
bins = memb.searchsorted(rng, side='left')
1301+
1302+
if nat_count > 0:
1303+
# NaT handling as in pandas._libs.lib.generate_bins_dt64()
1304+
# shift bins by the number of NaT
1305+
bins += nat_count
1306+
bins = np.insert(bins, 0, nat_count)
1307+
binner = binner.insert(0, tslib.NaT)
1308+
labels = labels.insert(0, tslib.NaT)
1309+
1310+
return binner, bins, labels
1311+
12961312

12971313
def _take_new_index(obj, indexer, new_index, axis=0):
12981314
from pandas.core.api import Series, DataFrame

0 commit comments

Comments
 (0)