-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Fix some PeriodIndex resampling issues #16153
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ca7b6f2
c27f430
390e16e
23566c2
a82879d
4b1c740
73c0990
fa6c1d3
7ea04e9
82a8275
432c623
c8814fb
486ad67
ad8519f
39fc7e2
efcad5b
41401d4
398a684
8358c41
6084e0c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,7 +14,7 @@ | |
from pandas.core.indexes.datetimes import DatetimeIndex, date_range | ||
from pandas.core.indexes.timedeltas import TimedeltaIndex | ||
from pandas.tseries.offsets import DateOffset, Tick, Day, _delta_to_nanoseconds | ||
from pandas.core.indexes.period import PeriodIndex, period_range | ||
from pandas.core.indexes.period import PeriodIndex | ||
import pandas.core.common as com | ||
import pandas.core.algorithms as algos | ||
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries | ||
|
@@ -834,53 +834,32 @@ class PeriodIndexResampler(DatetimeIndexResampler): | |
def _resampler_for_grouping(self): | ||
return PeriodIndexResamplerGroupby | ||
|
||
def _get_binner_for_time(self): | ||
if self.kind == 'timestamp': | ||
return super(PeriodIndexResampler, self)._get_binner_for_time() | ||
return self.groupby._get_period_bins(self.ax) | ||
|
||
def _convert_obj(self, obj): | ||
obj = super(PeriodIndexResampler, self)._convert_obj(obj) | ||
|
||
offset = to_offset(self.freq) | ||
if offset.n > 1: | ||
if self.kind == 'period': # pragma: no cover | ||
print('Warning: multiple of frequency -> timestamps') | ||
|
||
# Cannot have multiple of periods, convert to timestamp | ||
if self._from_selection: | ||
# see GH 14008, GH 12871 | ||
msg = ("Resampling from level= or on= selection" | ||
" with a PeriodIndex is not currently supported," | ||
" use .set_index(...) to explicitly set index") | ||
raise NotImplementedError(msg) | ||
|
||
if self.loffset is not None: | ||
# Cannot apply loffset/timedelta to PeriodIndex -> convert to | ||
# timestamps | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we show a warning for this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should, given that we kind of did that before (it was |
||
self.kind = 'timestamp' | ||
|
||
# convert to timestamp | ||
if not (self.kind is None or self.kind == 'period'): | ||
if self._from_selection: | ||
# see GH 14008, GH 12871 | ||
msg = ("Resampling from level= or on= selection" | ||
" with a PeriodIndex is not currently supported," | ||
" use .set_index(...) to explicitly set index") | ||
raise NotImplementedError(msg) | ||
else: | ||
obj = obj.to_timestamp(how=self.convention) | ||
if self.kind == 'timestamp': | ||
obj = obj.to_timestamp(how=self.convention) | ||
|
||
return obj | ||
|
||
def aggregate(self, arg, *args, **kwargs): | ||
result, how = self._aggregate(arg, *args, **kwargs) | ||
if result is None: | ||
result = self._downsample(arg, *args, **kwargs) | ||
|
||
result = self._apply_loffset(result) | ||
return result | ||
|
||
agg = aggregate | ||
|
||
def _get_new_index(self): | ||
""" return our new index """ | ||
ax = self.ax | ||
|
||
if len(ax) == 0: | ||
values = [] | ||
else: | ||
start = ax[0].asfreq(self.freq, how=self.convention) | ||
end = ax[-1].asfreq(self.freq, how='end') | ||
values = period_range(start, end, freq=self.freq).asi8 | ||
|
||
return ax._shallow_copy(values, freq=self.freq) | ||
|
||
def _downsample(self, how, **kwargs): | ||
""" | ||
Downsample the cython defined function | ||
|
@@ -898,22 +877,17 @@ def _downsample(self, how, **kwargs): | |
how = self._is_cython_func(how) or how | ||
ax = self.ax | ||
|
||
new_index = self._get_new_index() | ||
|
||
# Start vs. end of period | ||
memb = ax.asfreq(self.freq, how=self.convention) | ||
|
||
if is_subperiod(ax.freq, self.freq): | ||
# Downsampling | ||
if len(new_index) == 0: | ||
bins = [] | ||
else: | ||
i8 = memb.asi8 | ||
rng = np.arange(i8[0], i8[-1] + 1) | ||
bins = memb.searchsorted(rng, side='right') | ||
grouper = BinGrouper(bins, new_index) | ||
return self._groupby_and_aggregate(how, grouper=grouper) | ||
return self._groupby_and_aggregate(how, grouper=self.grouper) | ||
elif is_superperiod(ax.freq, self.freq): | ||
if how == 'ohlc': | ||
# GH #13083 | ||
# upsampling to subperiods is handled as an asfreq, which works | ||
# for pure aggregating/reducing methods | ||
# OHLC reduces along the time dimension, but creates multiple | ||
# values for each period -> handle by _groupby_and_aggregate() | ||
return self._groupby_and_aggregate(how, grouper=self.grouper) | ||
return self.asfreq() | ||
elif ax.freq == self.freq: | ||
return self.asfreq() | ||
|
@@ -936,19 +910,16 @@ def _upsample(self, method, limit=None, fill_value=None): | |
.fillna | ||
|
||
""" | ||
if self._from_selection: | ||
raise ValueError("Upsampling from level= or on= selection" | ||
" is not supported, use .set_index(...)" | ||
" to explicitly set index to" | ||
" datetime-like") | ||
|
||
# we may need to actually resample as if we are timestamps | ||
if self.kind == 'timestamp': | ||
return super(PeriodIndexResampler, self)._upsample( | ||
method, limit=limit, fill_value=fill_value) | ||
|
||
self._set_binner() | ||
ax = self.ax | ||
obj = self.obj | ||
new_index = self._get_new_index() | ||
new_index = self.binner | ||
|
||
# Start vs. end of period | ||
memb = ax.asfreq(self.freq, how=self.convention) | ||
|
@@ -1293,6 +1264,51 @@ def _get_time_period_bins(self, ax): | |
|
||
return binner, bins, labels | ||
|
||
def _get_period_bins(self, ax): | ||
if not isinstance(ax, PeriodIndex): | ||
raise TypeError('axis must be a PeriodIndex, but got ' | ||
'an instance of %r' % type(ax).__name__) | ||
|
||
memb = ax.asfreq(self.freq, how=self.convention) | ||
|
||
# NaT handling as in pandas._lib.lib.generate_bins_dt64() | ||
nat_count = 0 | ||
if memb.hasnans: | ||
nat_count = np.sum(memb._isnan) | ||
memb = memb[~memb._isnan] | ||
|
||
# if index contains no valid (non-NaT) values, return empty index | ||
if not len(memb): | ||
binner = labels = PeriodIndex( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I left this, ok for now. |
||
data=[], freq=self.freq, name=ax.name) | ||
return binner, [], labels | ||
|
||
start = ax.min().asfreq(self.freq, how=self.convention) | ||
end = ax.max().asfreq(self.freq, how='end') | ||
|
||
labels = binner = PeriodIndex(start=start, end=end, | ||
freq=self.freq, name=ax.name) | ||
|
||
i8 = memb.asi8 | ||
freq_mult = self.freq.n | ||
|
||
# when upsampling to subperiods, we need to generate enough bins | ||
expected_bins_count = len(binner) * freq_mult | ||
i8_extend = expected_bins_count - (i8[-1] - i8[0]) | ||
rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) | ||
rng += freq_mult | ||
bins = memb.searchsorted(rng, side='left') | ||
|
||
if nat_count > 0: | ||
# NaT handling as in pandas._lib.lib.generate_bins_dt64() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this path tested sufficiently, e.g. 0, 1, 2 NaT? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added a test case for consecutive NaTs in the index (1cad7fa) Should be sufficiently tested, cases covered:
Any ideas for more exhaustive test cases? |
||
# shift bins by the number of NaT | ||
bins += nat_count | ||
bins = np.insert(bins, 0, nat_count) | ||
binner = binner.insert(0, tslib.NaT) | ||
labels = labels.insert(0, tslib.NaT) | ||
|
||
return binner, bins, labels | ||
|
||
|
||
def _take_new_index(obj, indexer, new_index, axis=0): | ||
from pandas.core.api import Series, DataFrame | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the fixed things that are not changed (e.g. [1], [2]), do in a separate ipython block above (to avoid repeating in previous/new).