diff --git a/doc/source/release.rst b/doc/source/release.rst index 78678a0ee81a6..0302f944ed4af 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -97,6 +97,9 @@ Bug Fixes ``to_replace`` argument (:issue:`6332`) - Raise when trying to align on different levels of a multi-index assignment (:issue:`3738`) - Bug in setting complex dtypes via boolean indexing (:issue:`6345`) +- Bug in TimeGrouper/resample when presented with a non-monotonic DatetimeIndex would return invalid results. (:issue:`4161`) +- Bug in index name propagation in TimeGrouper/resample (:issue:`4161`) +- TimeGrouper has a more compatible API to the rest of the groupers (e.g. ``groups`` was missing) (:issue:`3881`) pandas 0.13.1 ------------- diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c0ea730e38a27..0919309afd434 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -217,8 +217,6 @@ def __init__(self, obj, keys=None, axis=0, level=None, if isinstance(obj, NDFrame): obj._consolidate_inplace() - self.obj = obj - self.axis = obj._get_axis_number(axis) self.level = level if not as_index: @@ -234,9 +232,11 @@ def __init__(self, obj, keys=None, axis=0, level=None, self.squeeze = squeeze if grouper is None: - grouper, exclusions = _get_grouper(obj, keys, axis=axis, - level=level, sort=sort) + grouper, exclusions, obj = _get_grouper(obj, keys, axis=axis, + level=level, sort=sort) + self.obj = obj + self.axis = obj._get_axis_number(axis) self.grouper = grouper self.exclusions = set(exclusions) if exclusions else set() @@ -1106,6 +1106,7 @@ def aggregate(self, values, how, axis=0): # will be filled in Cython function result = np.empty(out_shape, dtype=values.dtype) + result.fill(np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, how, is_numeric) @@ -1258,6 +1259,11 @@ class CustomGrouper(object): def get_grouper(self, obj): raise NotImplementedError + # delegates + @property + def groups(self): + return 
self.grouper.groups + class BinGrouper(Grouper): @@ -1266,6 +1272,14 @@ def __init__(self, bins, binlabels, filter_empty=False): self.binlabels = _ensure_index(binlabels) self._filter_empty_groups = filter_empty + @cache_readonly + def groups(self): + """ dict {group name -> group labels} """ + + # this is mainly for compat + # GH 3881 + return dict(zip(self.binlabels,self.bins)) + @property def nkeys(self): return 1 @@ -1560,10 +1574,10 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): key = group_axis if isinstance(key, CustomGrouper): - gpr = key.get_grouper(obj) - return gpr, [] + binner, gpr, obj = key.get_grouper(obj) + return gpr, [], obj elif isinstance(key, Grouper): - return key, [] + return key, [], obj if not isinstance(key, (tuple, list)): keys = [key] @@ -1623,7 +1637,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True): grouper = Grouper(group_axis, groupings, sort=sort) - return grouper, exclusions + return grouper, exclusions, obj def _is_label_like(val): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index dbaf41a269ec2..ba06cebd20080 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2764,6 +2764,64 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None + def test_groupby_with_timegrouper(self): + # GH 4161 + # TimeGrouper requires a sorted index + # also verifies that the resultant index has the correct name + import datetime as DT + df = DataFrame({ + 'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(), + 'Quantity': [18,3,5,1,9,3], + 'Date' : [ + DT.datetime(2013,9,1,13,0), + DT.datetime(2013,9,1,13,5), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,3,10,0), + DT.datetime(2013,12,2,12,0), + DT.datetime(2013,9,2,14,0), + ]}) + df = df.set_index(['Date']) + + expected = DataFrame({ 'Quantity' : np.nan }, + index=date_range('20130901 13:00:00','20131205 13:00:00',freq='5D',name='Date')) + 
expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64') + + result1 = df.resample('5D',how=sum) + assert_frame_equal(result1, expected) + + df_sorted = df.sort_index() + result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum() + assert_frame_equal(result2, expected) + + result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum() + assert_frame_equal(result3, expected) + + def test_groupby_with_timegrouper_methods(self): + # GH 3881 + # make sure API of timegrouper conforms + + import datetime as DT + df = pd.DataFrame({ + 'Branch' : 'A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(), + 'Quantity': [1,3,5,8,9,3], + 'Date' : [ + DT.datetime(2013,1,1,13,0), + DT.datetime(2013,1,1,13,5), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,12,2,12,0), + DT.datetime(2013,12,2,14,0), + ]}) + + df = df.set_index('Date', drop=False) + g = df.groupby(pd.TimeGrouper('6M')) + self.assertTrue(g.group_keys) + self.assertTrue(isinstance(g.grouper,pd.core.groupby.BinGrouper)) + groups = g.groups + self.assertTrue(isinstance(groups,dict)) + self.assertTrue(len(groups) == 3) + def test_cumcount(self): df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) g = df.groupby('A') diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 5377543ac8c54..e4221fdea083c 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -72,17 +72,12 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', self.base = base def resample(self, obj): - axis = obj._get_axis(self.axis) + ax = obj._get_axis(self.axis) - if not axis.is_monotonic: - try: - obj = obj.sort_index(axis=self.axis) - except TypeError: - obj = obj.sort_index() - - if isinstance(axis, DatetimeIndex): + obj = self._ensure_sortedness(obj) + if isinstance(ax, DatetimeIndex): rs = self._resample_timestamps(obj) - elif isinstance(axis, PeriodIndex): + elif isinstance(ax, PeriodIndex): offset = to_offset(self.freq) if 
offset.n > 1: if self.kind == 'period': # pragma: no cover @@ -95,55 +90,68 @@ def resample(self, obj): else: obj = obj.to_timestamp(how=self.convention) rs = self._resample_timestamps(obj) - elif len(axis) == 0: + elif len(ax) == 0: return obj else: # pragma: no cover raise TypeError('Only valid with DatetimeIndex or PeriodIndex') rs_axis = rs._get_axis(self.axis) - rs_axis.name = axis.name + rs_axis.name = ax.name return rs def get_grouper(self, obj): - # Only return grouper - return self._get_time_grouper(obj)[1] + # return a tuple of (binner, grouper, obj) + return self._get_time_grouper(obj) + + def _ensure_sortedness(self, obj): + # ensure that our object is sorted + ax = obj._get_axis(self.axis) + if not ax.is_monotonic: + try: + obj = obj.sort_index(axis=self.axis) + except TypeError: + obj = obj.sort_index() + return obj def _get_time_grouper(self, obj): - axis = obj._get_axis(self.axis) + obj = self._ensure_sortedness(obj) + ax = obj._get_axis(self.axis) if self.kind is None or self.kind == 'timestamp': - binner, bins, binlabels = self._get_time_bins(axis) + binner, bins, binlabels = self._get_time_bins(ax) else: - binner, bins, binlabels = self._get_time_period_bins(axis) + binner, bins, binlabels = self._get_time_period_bins(ax) grouper = BinGrouper(bins, binlabels) - return binner, grouper + return binner, grouper, obj - def _get_time_bins(self, axis): - if not isinstance(axis, DatetimeIndex): + def _get_time_bins(self, ax): + if not isinstance(ax, DatetimeIndex): raise TypeError('axis must be a DatetimeIndex, but got ' - 'an instance of %r' % type(axis).__name__) + 'an instance of %r' % type(ax).__name__) - if len(axis) == 0: - binner = labels = DatetimeIndex(data=[], freq=self.freq) + if len(ax) == 0: + binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name) return binner, [], labels - first, last = _get_range_edges(axis, self.freq, closed=self.closed, + first, last = _get_range_edges(ax, self.freq, closed=self.closed, base=self.base) 
- tz = axis.tz + tz = ax.tz binner = labels = DatetimeIndex(freq=self.freq, start=first.replace(tzinfo=None), - end=last.replace(tzinfo=None), tz=tz) + end=last.replace(tzinfo=None), + tz=tz, + name=ax.name) # a little hack trimmed = False - if (len(binner) > 2 and binner[-2] == axis[-1] and + if (len(binner) > 2 and binner[-2] == ax[-1] and self.closed == 'right'): binner = binner[:-1] trimmed = True - ax_values = axis.asi8 + ax_values = ax.asi8 binner, bin_edges = self._adjust_bin_edges(binner, ax_values) # general version, knowing nothing about relative frequencies @@ -180,22 +188,24 @@ def _adjust_bin_edges(self, binner, ax_values): return binner, bin_edges - def _get_time_period_bins(self, axis): - if not isinstance(axis, DatetimeIndex): + def _get_time_period_bins(self, ax): + if not isinstance(ax, DatetimeIndex): raise TypeError('axis must be a DatetimeIndex, but got ' - 'an instance of %r' % type(axis).__name__) + 'an instance of %r' % type(ax).__name__) - if not len(axis): - binner = labels = PeriodIndex(data=[], freq=self.freq) + if not len(ax): + binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name) return binner, [], labels - labels = binner = PeriodIndex(start=axis[0], end=axis[-1], - freq=self.freq) + labels = binner = PeriodIndex(start=ax[0], + end=ax[-1], + freq=self.freq, + name=ax.name) end_stamps = (labels + 1).asfreq(self.freq, 's').to_timestamp() - if axis.tzinfo: - end_stamps = end_stamps.tz_localize(axis.tzinfo) - bins = axis.searchsorted(end_stamps, side='left') + if ax.tzinfo: + end_stamps = end_stamps.tz_localize(ax.tzinfo) + bins = ax.searchsorted(end_stamps, side='left') return binner, bins, labels @@ -206,7 +216,7 @@ def _agg_method(self): def _resample_timestamps(self, obj): axlabels = obj._get_axis(self.axis) - binner, grouper = self._get_time_grouper(obj) + binner, grouper, _ = self._get_time_grouper(obj) # Determine if we're downsampling if axlabels.freq is not None or axlabels.inferred_freq is not None: diff --git 
a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 59be80f7012d7..7e9433ac41ddd 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -1100,7 +1100,7 @@ def test_apply_iteration(self): df = DataFrame({'open': 1, 'close': 2}, index=ind) tg = TimeGrouper('M') - grouper = tg.get_grouper(df) + _, grouper, _ = tg.get_grouper(df) # Errors @@ -1118,7 +1118,7 @@ def test_panel_aggregation(self): minor_axis=['A', 'B', 'C', 'D']) tg = TimeGrouper('M', axis=1) - grouper = tg.get_grouper(wp) + _, grouper, _ = tg.get_grouper(wp) bingrouped = wp.groupby(grouper) binagg = bingrouped.mean()