Skip to content

BUG: TimeGrouper sortedness / API fix (GH4161,GH3881) #6350

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 14, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ Bug Fixes
``to_replace`` argument (:issue:`6332`)
- Raise when trying to align on different levels of a multi-index assignment (:issue:`3738`)
- Bug in setting complex dtypes via boolean indexing (:issue:`6345`)
- Bug in TimeGrouper/resample where a non-monotonic DatetimeIndex would return invalid results (:issue:`4161`)
- Bug in index name propagation in TimeGrouper/resample (:issue:`4161`)
- TimeGrouper now has an API more compatible with the rest of the groupers (e.g. ``groups`` was missing) (:issue:`3881`)

pandas 0.13.1
-------------
Expand Down
30 changes: 22 additions & 8 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,6 @@ def __init__(self, obj, keys=None, axis=0, level=None,
if isinstance(obj, NDFrame):
obj._consolidate_inplace()

self.obj = obj
self.axis = obj._get_axis_number(axis)
self.level = level

if not as_index:
Expand All @@ -234,9 +232,11 @@ def __init__(self, obj, keys=None, axis=0, level=None,
self.squeeze = squeeze

if grouper is None:
grouper, exclusions = _get_grouper(obj, keys, axis=axis,
level=level, sort=sort)
grouper, exclusions, obj = _get_grouper(obj, keys, axis=axis,
level=level, sort=sort)

self.obj = obj
self.axis = obj._get_axis_number(axis)
self.grouper = grouper
self.exclusions = set(exclusions) if exclusions else set()

Expand Down Expand Up @@ -1106,6 +1106,7 @@ def aggregate(self, values, how, axis=0):

# will be filled in Cython function
result = np.empty(out_shape, dtype=values.dtype)
result.fill(np.nan)
counts = np.zeros(self.ngroups, dtype=np.int64)

result = self._aggregate(result, counts, values, how, is_numeric)
Expand Down Expand Up @@ -1258,6 +1259,11 @@ class CustomGrouper(object):
def get_grouper(self, obj):
    """Placeholder that unconditionally raises.

    Raises
    ------
    NotImplementedError
        Always; nothing is computed from ``obj`` here.
    """
    # NOTE(review): presumably subclasses override this -- confirm.
    raise NotImplementedError

# delegates
@property
def groups(self):
    """Return the ``groups`` attribute of ``self.grouper`` (pure delegation)."""
    delegate = self.grouper
    return delegate.groups


class BinGrouper(Grouper):

Expand All @@ -1266,6 +1272,14 @@ def __init__(self, bins, binlabels, filter_empty=False):
self.binlabels = _ensure_index(binlabels)
self._filter_empty_groups = filter_empty

@cache_readonly
def groups(self):
    """Return a dict pairing each entry of ``self.binlabels`` with the
    entry of ``self.bins`` at the same position.
    """
    # this is mainly for compat
    # GH 3881
    mapping = {}
    for label, bin_value in zip(self.binlabels, self.bins):
        mapping[label] = bin_value
    return mapping

@property
def nkeys(self):
    """Constant: this property always evaluates to 1."""
    return 1
Expand Down Expand Up @@ -1560,10 +1574,10 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
key = group_axis

if isinstance(key, CustomGrouper):
gpr = key.get_grouper(obj)
return gpr, []
binner, gpr, obj = key.get_grouper(obj)
return gpr, [], obj
elif isinstance(key, Grouper):
return key, []
return key, [], obj

if not isinstance(key, (tuple, list)):
keys = [key]
Expand Down Expand Up @@ -1623,7 +1637,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):

grouper = Grouper(group_axis, groupings, sort=sort)

return grouper, exclusions
return grouper, exclusions, obj


def _is_label_like(val):
Expand Down
58 changes: 58 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2764,6 +2764,64 @@ def test_groupby_with_empty(self):
grouped = series.groupby(grouper)
assert next(iter(grouped), None) is None

def test_groupby_with_timegrouper(self):
    # GH 4161
    # The same expected frame must come back from resample and from
    # groupby(TimeGrouper), for both the unsorted and the sorted index;
    # the expected index also carries the name 'Date'.
    from datetime import datetime

    buyers = ['Carl', 'Carl', 'Carl', 'Carl', 'Joe', 'Carl']
    quantities = [18, 3, 5, 1, 9, 3]
    stamps = [
        datetime(2013, 9, 1, 13, 0),
        datetime(2013, 9, 1, 13, 5),
        datetime(2013, 10, 1, 20, 0),
        datetime(2013, 10, 3, 10, 0),
        datetime(2013, 12, 2, 12, 0),
        datetime(2013, 9, 2, 14, 0),
    ]
    df = DataFrame({'Buyer': buyers,
                    'Quantity': quantities,
                    'Date': stamps})
    df = df.set_index(['Date'])

    # expected: all-NaN 5-day bins, with three of them filled in
    bin_index = date_range('20130901 13:00:00', '20131205 13:00:00',
                           freq='5D', name='Date')
    expected = DataFrame({'Quantity': np.nan}, index=bin_index)
    expected.iloc[[0, 6, 18], 0] = np.array([24., 6., 9.], dtype='float64')

    # unsorted index through resample
    via_resample = df.resample('5D', how=sum)
    assert_frame_equal(via_resample, expected)

    # pre-sorted index through groupby
    df_sorted = df.sort_index()
    via_sorted_groupby = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum()
    assert_frame_equal(via_sorted_groupby, expected)

    # unsorted index through groupby
    via_unsorted_groupby = df.groupby(pd.TimeGrouper(freq='5D')).sum()
    assert_frame_equal(via_unsorted_groupby, expected)

def test_groupby_with_timegrouper_methods(self):
    # GH 3881
    # a groupby built from a TimeGrouper should expose group_keys,
    # a BinGrouper as its grouper, and a dict-valued ``groups``
    from datetime import datetime

    frame = pd.DataFrame({
        'Branch': ['A', 'A', 'A', 'A', 'A', 'B'],
        'Buyer': ['Carl', 'Mark', 'Carl', 'Joe', 'Joe', 'Carl'],
        'Quantity': [1, 3, 5, 8, 9, 3],
        'Date': [
            datetime(2013, 1, 1, 13, 0),
            datetime(2013, 1, 1, 13, 5),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 12, 2, 12, 0),
            datetime(2013, 12, 2, 14, 0),
        ],
    })
    frame = frame.set_index('Date', drop=False)

    grouped = frame.groupby(pd.TimeGrouper('6M'))
    self.assertTrue(grouped.group_keys)
    self.assertTrue(isinstance(grouped.grouper, pd.core.groupby.BinGrouper))

    mapping = grouped.groups
    self.assertTrue(isinstance(mapping, dict))
    self.assertTrue(len(mapping) == 3)

def test_cumcount(self):
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
g = df.groupby('A')
Expand Down
86 changes: 48 additions & 38 deletions pandas/tseries/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,17 +72,12 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean',
self.base = base

def resample(self, obj):
axis = obj._get_axis(self.axis)
ax = obj._get_axis(self.axis)

if not axis.is_monotonic:
try:
obj = obj.sort_index(axis=self.axis)
except TypeError:
obj = obj.sort_index()

if isinstance(axis, DatetimeIndex):
obj = self._ensure_sortedness(obj)
if isinstance(ax, DatetimeIndex):
rs = self._resample_timestamps(obj)
elif isinstance(axis, PeriodIndex):
elif isinstance(ax, PeriodIndex):
offset = to_offset(self.freq)
if offset.n > 1:
if self.kind == 'period': # pragma: no cover
Expand All @@ -95,55 +90,68 @@ def resample(self, obj):
else:
obj = obj.to_timestamp(how=self.convention)
rs = self._resample_timestamps(obj)
elif len(axis) == 0:
elif len(ax) == 0:
return obj
else: # pragma: no cover
raise TypeError('Only valid with DatetimeIndex or PeriodIndex')

rs_axis = rs._get_axis(self.axis)
rs_axis.name = axis.name
rs_axis.name = ax.name
return rs

def get_grouper(self, obj):
# Only return grouper
return self._get_time_grouper(obj)[1]
# return a tuple of (binner, grouper, obj)
return self._get_time_grouper(obj)

def _ensure_sortedness(self, obj):
    """Return ``obj`` sorted along ``self.axis`` when that axis is not monotonic.

    If the axis reports ``is_monotonic`` the input is returned untouched.
    Otherwise ``obj.sort_index(axis=self.axis)`` is tried first; if that
    raises ``TypeError`` the call is retried as ``obj.sort_index()``.
    """
    if obj._get_axis(self.axis).is_monotonic:
        return obj

    try:
        return obj.sort_index(axis=self.axis)
    except TypeError:
        # retry without the ``axis`` keyword
        return obj.sort_index()

def _get_time_grouper(self, obj):
axis = obj._get_axis(self.axis)
obj = self._ensure_sortedness(obj)
ax = obj._get_axis(self.axis)

if self.kind is None or self.kind == 'timestamp':
binner, bins, binlabels = self._get_time_bins(axis)
binner, bins, binlabels = self._get_time_bins(ax)
else:
binner, bins, binlabels = self._get_time_period_bins(axis)
binner, bins, binlabels = self._get_time_period_bins(ax)

grouper = BinGrouper(bins, binlabels)
return binner, grouper
return binner, grouper, obj

def _get_time_bins(self, axis):
if not isinstance(axis, DatetimeIndex):
def _get_time_bins(self, ax):
if not isinstance(ax, DatetimeIndex):
raise TypeError('axis must be a DatetimeIndex, but got '
'an instance of %r' % type(axis).__name__)
'an instance of %r' % type(ax).__name__)

if len(axis) == 0:
binner = labels = DatetimeIndex(data=[], freq=self.freq)
if len(ax) == 0:
binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name)
return binner, [], labels

first, last = _get_range_edges(axis, self.freq, closed=self.closed,
first, last = _get_range_edges(ax, self.freq, closed=self.closed,
base=self.base)
tz = axis.tz
tz = ax.tz
binner = labels = DatetimeIndex(freq=self.freq,
start=first.replace(tzinfo=None),
end=last.replace(tzinfo=None), tz=tz)
end=last.replace(tzinfo=None),
tz=tz,
name=ax.name)

# a little hack
trimmed = False
if (len(binner) > 2 and binner[-2] == axis[-1] and
if (len(binner) > 2 and binner[-2] == ax[-1] and
self.closed == 'right'):

binner = binner[:-1]
trimmed = True

ax_values = axis.asi8
ax_values = ax.asi8
binner, bin_edges = self._adjust_bin_edges(binner, ax_values)

# general version, knowing nothing about relative frequencies
Expand Down Expand Up @@ -180,22 +188,24 @@ def _adjust_bin_edges(self, binner, ax_values):

return binner, bin_edges

def _get_time_period_bins(self, axis):
if not isinstance(axis, DatetimeIndex):
def _get_time_period_bins(self, ax):
if not isinstance(ax, DatetimeIndex):
raise TypeError('axis must be a DatetimeIndex, but got '
'an instance of %r' % type(axis).__name__)
'an instance of %r' % type(ax).__name__)

if not len(axis):
binner = labels = PeriodIndex(data=[], freq=self.freq)
if not len(ax):
binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name)
return binner, [], labels

labels = binner = PeriodIndex(start=axis[0], end=axis[-1],
freq=self.freq)
labels = binner = PeriodIndex(start=ax[0],
end=ax[-1],
freq=self.freq,
name=ax.name)

end_stamps = (labels + 1).asfreq(self.freq, 's').to_timestamp()
if axis.tzinfo:
end_stamps = end_stamps.tz_localize(axis.tzinfo)
bins = axis.searchsorted(end_stamps, side='left')
if ax.tzinfo:
end_stamps = end_stamps.tz_localize(ax.tzinfo)
bins = ax.searchsorted(end_stamps, side='left')

return binner, bins, labels

Expand All @@ -206,7 +216,7 @@ def _agg_method(self):
def _resample_timestamps(self, obj):
axlabels = obj._get_axis(self.axis)

binner, grouper = self._get_time_grouper(obj)
binner, grouper, _ = self._get_time_grouper(obj)

# Determine if we're downsampling
if axlabels.freq is not None or axlabels.inferred_freq is not None:
Expand Down
4 changes: 2 additions & 2 deletions pandas/tseries/tests/test_resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -1100,7 +1100,7 @@ def test_apply_iteration(self):
df = DataFrame({'open': 1, 'close': 2}, index=ind)
tg = TimeGrouper('M')

grouper = tg.get_grouper(df)
_, grouper, _ = tg.get_grouper(df)

# Errors

Expand All @@ -1118,7 +1118,7 @@ def test_panel_aggregation(self):
minor_axis=['A', 'B', 'C', 'D'])

tg = TimeGrouper('M', axis=1)
grouper = tg.get_grouper(wp)
_, grouper, _ = tg.get_grouper(wp)
bingrouped = wp.groupby(grouper)
binagg = bingrouped.mean()

Expand Down