Skip to content

BUG:Time Grouper bug fix when applied for list groupers #17587

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Oct 1, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -704,6 +704,7 @@ Groupby/Resample/Rolling
- Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`)
- Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`)
- Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`)
- Bug in ``DataFrame.groupby`` where the result differed depending on whether a ``TimeGrouper`` was passed as a list or as a scalar (:issue:`17530`)

Sparse
^^^^^^
Expand Down
119 changes: 94 additions & 25 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,11 +256,13 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
def ax(self):
return self.grouper

def _get_grouper(self, obj):
def _get_grouper(self, obj, validate=True):
"""
Parameters
----------
obj : the subject object
validate : boolean, default True
if True, validate the grouper

Returns
-------
Expand All @@ -271,7 +273,8 @@ def _get_grouper(self, obj):
self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key],
axis=self.axis,
level=self.level,
sort=self.sort)
sort=self.sort,
validate=validate)
return self.binner, self.grouper, self.obj

def _set_grouper(self, obj, sort=False):
Expand Down Expand Up @@ -326,12 +329,6 @@ def _set_grouper(self, obj, sort=False):
self.grouper = ax
return self.grouper

def _get_binner_for_grouping(self, obj):
""" default to the standard binner here """
group_axis = obj._get_axis(self.axis)
return Grouping(group_axis, None, obj=obj, name=self.key,
level=self.level, sort=self.sort, in_axis=False)

@property
def groups(self):
return self.grouper.groups
Expand Down Expand Up @@ -1733,16 +1730,34 @@ class BaseGrouper(object):
"""
This is an internal Grouper class, which actually holds
the generated groups

Parameters
----------
axis : int
the axis to group
groupings : array of Grouping
all the Grouping instances handled by this grouper;
for example, when a list of groupers is passed to groupby, pass that list
sort : boolean, default True
whether this grouper will give sorted result or not
group_keys : boolean, default True
mutated : boolean, default False
indexer : intp array, optional
the indexer created by Grouper
some groupers (e.g. TimeGrouper) sort their axis, so their
group_info is sorted as well; the indexer is needed to reorder

"""

def __init__(self, axis, groupings, sort=True, group_keys=True,
mutated=False):
mutated=False, indexer=None):
self._filter_empty_groups = self.compressed = len(groupings) != 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a doc-string explaining params (I know you just added 1 but good time)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm sorry I don't really understand all parameters, I've added those I know😂

self.axis = axis
self.groupings = groupings
self.sort = sort
self.group_keys = group_keys
self.mutated = mutated
self.indexer = indexer

@property
def shape(self):
Expand Down Expand Up @@ -1888,6 +1903,15 @@ def group_info(self):
comp_ids = _ensure_int64(comp_ids)
return comp_ids, obs_group_ids, ngroups

@cache_readonly
def label_info(self):
# return the labels of items in original grouped axis
labels, _, _ = self.group_info
if self.indexer is not None:
sorter = np.lexsort((labels, self.indexer))
labels = labels[sorter]
return labels

def _get_compressed_labels(self):
all_labels = [ping.labels for ping in self.groupings]
if len(all_labels) > 1:
Expand Down Expand Up @@ -2288,11 +2312,42 @@ def generate_bins_generic(values, binner, closed):

class BinGrouper(BaseGrouper):

def __init__(self, bins, binlabels, filter_empty=False, mutated=False):
"""
This is an internal Grouper class

Parameters
----------
bins : the split index of binlabels to group the item of axis
binlabels : the label list
filter_empty : boolean, default False
mutated : boolean, default False
indexer : an intp array

Examples
--------
bins: [2, 4, 6, 8, 10]
binlabels: DatetimeIndex(['2005-01-01', '2005-01-03',
'2005-01-05', '2005-01-07', '2005-01-09'],
dtype='datetime64[ns]', freq='2D')

the group_info, which contains the label of each item in grouped
axis, the index of label in label list, group number, is

(array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5)

means that, the grouped axis has 10 items, can be grouped into 5
labels, the first and second items belong to the first label, the
third and fourth items belong to the second label, and so on

"""

def __init__(self, bins, binlabels, filter_empty=False, mutated=False,
indexer=None):
self.bins = _ensure_int64(bins)
self.binlabels = _ensure_index(binlabels)
self._filter_empty_groups = filter_empty
self.mutated = mutated
self.indexer = indexer

@cache_readonly
def groups(self):
Expand Down Expand Up @@ -2460,6 +2515,19 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
self.grouper, self._labels, self._group_index = \
index._get_grouper_for_level(self.grouper, level)

# a passed Grouper like, directly get the grouper in the same way
# as single grouper groupby, use the group_info to get labels
elif isinstance(self.grouper, Grouper):
# get the new grouper; we already have disambiguated
# what key/level refer to exactly, don't need to
# check again as we have by this point converted these
# to an actual value (rather than a pd.Grouper)
_, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
if self.name is None:
self.name = grouper.result_index.name
self.obj = self.grouper.obj
self.grouper = grouper

else:
if self.grouper is None and self.name is not None:
self.grouper = self.obj[self.name]
Expand All @@ -2482,16 +2550,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
categories=c,
ordered=self.grouper.ordered))

# a passed Grouper like
elif isinstance(self.grouper, Grouper):

# get the new grouper
grouper = self.grouper._get_binner_for_grouping(self.obj)
self.obj = self.grouper.obj
self.grouper = grouper
if self.name is None:
self.name = grouper.name

# we are done
if isinstance(self.grouper, Grouping):
self.grouper = self.grouper.grouper
Expand Down Expand Up @@ -2536,6 +2594,10 @@ def ngroups(self):

@cache_readonly
def indices(self):
# we have a list of groupers
if isinstance(self.grouper, BaseGrouper):
return self.grouper.indices

values = _ensure_categorical(self.grouper)
return values._reverse_indexer()

Expand All @@ -2553,9 +2615,14 @@ def group_index(self):

def _make_labels(self):
if self._labels is None or self._group_index is None:
labels, uniques = algorithms.factorize(
self.grouper, sort=self.sort)
uniques = Index(uniques, name=self.name)
# we have a list of groupers
if isinstance(self.grouper, BaseGrouper):
labels = self.grouper.label_info
uniques = self.grouper.result_index
else:
labels, uniques = algorithms.factorize(
self.grouper, sort=self.sort)
uniques = Index(uniques, name=self.name)
self._labels = labels
self._group_index = uniques

Expand All @@ -2566,7 +2633,7 @@ def groups(self):


def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
mutated=False):
mutated=False, validate=True):
"""
create and return a BaseGrouper, which is an internal
mapping of how to create the grouper indexers.
Expand All @@ -2583,6 +2650,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
are and then creates a Grouping for each one, combined into
a BaseGrouper.

If validate, then check for key/level overlaps

"""
group_axis = obj._get_axis(axis)

Expand Down Expand Up @@ -2707,7 +2776,7 @@ def is_in_obj(gpr):

elif is_in_axis(gpr): # df.groupby('name')
if gpr in obj:
if gpr in obj.index.names:
if validate and gpr in obj.index.names:
warnings.warn(
("'%s' is both a column name and an index level.\n"
"Defaulting to column but "
Expand Down
27 changes: 2 additions & 25 deletions pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ def _get_binner(self):
"""

binner, bins, binlabels = self._get_binner_for_time()
bin_grouper = BinGrouper(bins, binlabels)
bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer)
return binner, bin_grouper

def _assure_grouper(self):
Expand Down Expand Up @@ -1105,35 +1105,12 @@ def _get_resampler(self, obj, kind=None):
"TimedeltaIndex or PeriodIndex, "
"but got an instance of %r" % type(ax).__name__)

def _get_grouper(self, obj):
def _get_grouper(self, obj, validate=True):
# create the resampler and return our binner
r = self._get_resampler(obj)
r._set_binner()
return r.binner, r.grouper, r.obj

def _get_binner_for_grouping(self, obj):
# return an ordering of the transformed group labels,
# suitable for multi-grouping, e.g the labels for
# the resampled intervals
binner, grouper, obj = self._get_grouper(obj)

l = []
for key, group in grouper.get_iterator(self.ax):
l.extend([key] * len(group))

if isinstance(self.ax, PeriodIndex):
grouper = binner.__class__(l, freq=binner.freq, name=binner.name)
else:
# resampling causes duplicated values, specifying freq is invalid
grouper = binner.__class__(l, name=binner.name)

# since we may have had to sort
# may need to reorder groups here
if self.indexer is not None:
indexer = self.indexer.argsort(kind='quicksort')
grouper = grouper.take(indexer)
return grouper

def _get_time_bins(self, ax):
if not isinstance(ax, DatetimeIndex):
raise TypeError('axis must be a DatetimeIndex, but got '
Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/groupby/test_timegrouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,3 +623,22 @@ def test_nunique_with_timegrouper_and_nat(self):
result = test.groupby(grouper)['data'].nunique()
expected = test[test.time.notnull()].groupby(grouper)['data'].nunique()
tm.assert_series_equal(result, expected)

def test_scalar_call_versus_list_call(self):
# Issue: 17530
data_frame = {
'location': ['shanghai', 'beijing', 'shanghai'],
'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15',
'2017-08-11 22:23:15'],
dtype='datetime64[ns]'),
'value': [1, 2, 3]
}
data_frame = pd.DataFrame(data_frame).set_index('time')
grouper = pd.Grouper(freq='D')

grouped = data_frame.groupby(grouper)
result = grouped.count()
grouped = data_frame.groupby([grouper])
expected = grouped.count()

assert_frame_equal(result, expected)