Skip to content

Commit cdbbf80

Browse files
ruiannjreback
authored and committed
BUG:Time Grouper bug fix when applied for list groupers (#17587)
closes #17530
1 parent 7d4a260 commit cdbbf80

File tree

4 files changed

+116
-50
lines changed

4 files changed

+116
-50
lines changed

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -704,6 +704,7 @@ Groupby/Resample/Rolling
704704
- Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`)
705705
- Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`)
706706
- Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`)
707+
- Bug where ``TimeGrouper`` gives different results when passed as a list and as a scalar (:issue:`17530`)
707708

708709
Sparse
709710
^^^^^^

pandas/core/groupby.py

+94-25
Original file line numberDiff line numberDiff line change
@@ -256,11 +256,13 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
256256
def ax(self):
257257
return self.grouper
258258

259-
def _get_grouper(self, obj):
259+
def _get_grouper(self, obj, validate=True):
260260
"""
261261
Parameters
262262
----------
263263
obj : the subject object
264+
validate : boolean, default True
265+
if True, validate the grouper
264266
265267
Returns
266268
-------
@@ -271,7 +273,8 @@ def _get_grouper(self, obj):
271273
self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key],
272274
axis=self.axis,
273275
level=self.level,
274-
sort=self.sort)
276+
sort=self.sort,
277+
validate=validate)
275278
return self.binner, self.grouper, self.obj
276279

277280
def _set_grouper(self, obj, sort=False):
@@ -326,12 +329,6 @@ def _set_grouper(self, obj, sort=False):
326329
self.grouper = ax
327330
return self.grouper
328331

329-
def _get_binner_for_grouping(self, obj):
330-
""" default to the standard binner here """
331-
group_axis = obj._get_axis(self.axis)
332-
return Grouping(group_axis, None, obj=obj, name=self.key,
333-
level=self.level, sort=self.sort, in_axis=False)
334-
335332
@property
336333
def groups(self):
337334
return self.grouper.groups
@@ -1733,16 +1730,34 @@ class BaseGrouper(object):
17331730
"""
17341731
This is an internal Grouper class, which actually holds
17351732
the generated groups
1733+
1734+
Parameters
1735+
----------
1736+
axis : int
1737+
the axis to group
1738+
groupings : array of grouping
1739+
all the grouping instances to handle in this grouper
1740+
for example for grouper list to groupby, need to pass the list
1741+
sort : boolean, default True
1742+
whether this grouper will give sorted result or not
1743+
group_keys : boolean, default True
1744+
mutated : boolean, default False
1745+
indexer : intp array, optional
1746+
the indexer created by Grouper
1747+
some groupers (TimeGrouper) sort their axis, so their
1748+
group_info is also sorted; the indexer is needed to reorder
1749+
17361750
"""
17371751

17381752
def __init__(self, axis, groupings, sort=True, group_keys=True,
1739-
mutated=False):
1753+
mutated=False, indexer=None):
17401754
self._filter_empty_groups = self.compressed = len(groupings) != 1
17411755
self.axis = axis
17421756
self.groupings = groupings
17431757
self.sort = sort
17441758
self.group_keys = group_keys
17451759
self.mutated = mutated
1760+
self.indexer = indexer
17461761

17471762
@property
17481763
def shape(self):
@@ -1888,6 +1903,15 @@ def group_info(self):
18881903
comp_ids = _ensure_int64(comp_ids)
18891904
return comp_ids, obs_group_ids, ngroups
18901905

1906+
@cache_readonly
1907+
def label_info(self):
1908+
# return the labels of items in original grouped axis
1909+
labels, _, _ = self.group_info
1910+
if self.indexer is not None:
1911+
sorter = np.lexsort((labels, self.indexer))
1912+
labels = labels[sorter]
1913+
return labels
1914+
18911915
def _get_compressed_labels(self):
18921916
all_labels = [ping.labels for ping in self.groupings]
18931917
if len(all_labels) > 1:
@@ -2288,11 +2312,42 @@ def generate_bins_generic(values, binner, closed):
22882312

22892313
class BinGrouper(BaseGrouper):
22902314

2291-
def __init__(self, bins, binlabels, filter_empty=False, mutated=False):
2315+
"""
2316+
This is an internal Grouper class
2317+
2318+
Parameters
2319+
----------
2320+
bins : the split index of binlabels to group the item of axis
2321+
binlabels : the label list
2322+
filter_empty : boolean, default False
2323+
mutated : boolean, default False
2324+
indexer : an intp array
2325+
2326+
Examples
2327+
--------
2328+
bins: [2, 4, 6, 8, 10]
2329+
binlabels: DatetimeIndex(['2005-01-01', '2005-01-03',
2330+
'2005-01-05', '2005-01-07', '2005-01-09'],
2331+
dtype='datetime64[ns]', freq='2D')
2332+
2333+
the group_info, which contains the label of each item in grouped
2334+
axis, the index of label in label list, group number, is
2335+
2336+
(array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5)
2337+
2338+
means that, the grouped axis has 10 items, can be grouped into 5
2339+
labels, the first and second items belong to the first label, the
2340+
third and fourth items belong to the second label, and so on
2341+
2342+
"""
2343+
2344+
def __init__(self, bins, binlabels, filter_empty=False, mutated=False,
2345+
indexer=None):
22922346
self.bins = _ensure_int64(bins)
22932347
self.binlabels = _ensure_index(binlabels)
22942348
self._filter_empty_groups = filter_empty
22952349
self.mutated = mutated
2350+
self.indexer = indexer
22962351

22972352
@cache_readonly
22982353
def groups(self):
@@ -2460,6 +2515,19 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
24602515
self.grouper, self._labels, self._group_index = \
24612516
index._get_grouper_for_level(self.grouper, level)
24622517

2518+
# a passed Grouper like, directly get the grouper in the same way
2519+
# as single grouper groupby, use the group_info to get labels
2520+
elif isinstance(self.grouper, Grouper):
2521+
# get the new grouper; we already have disambiguated
2522+
# what key/level refer to exactly, don't need to
2523+
# check again as we have by this point converted these
2524+
# to an actual value (rather than a pd.Grouper)
2525+
_, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
2526+
if self.name is None:
2527+
self.name = grouper.result_index.name
2528+
self.obj = self.grouper.obj
2529+
self.grouper = grouper
2530+
24632531
else:
24642532
if self.grouper is None and self.name is not None:
24652533
self.grouper = self.obj[self.name]
@@ -2482,16 +2550,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
24822550
categories=c,
24832551
ordered=self.grouper.ordered))
24842552

2485-
# a passed Grouper like
2486-
elif isinstance(self.grouper, Grouper):
2487-
2488-
# get the new grouper
2489-
grouper = self.grouper._get_binner_for_grouping(self.obj)
2490-
self.obj = self.grouper.obj
2491-
self.grouper = grouper
2492-
if self.name is None:
2493-
self.name = grouper.name
2494-
24952553
# we are done
24962554
if isinstance(self.grouper, Grouping):
24972555
self.grouper = self.grouper.grouper
@@ -2536,6 +2594,10 @@ def ngroups(self):
25362594

25372595
@cache_readonly
25382596
def indices(self):
2597+
# we have a list of groupers
2598+
if isinstance(self.grouper, BaseGrouper):
2599+
return self.grouper.indices
2600+
25392601
values = _ensure_categorical(self.grouper)
25402602
return values._reverse_indexer()
25412603

@@ -2553,9 +2615,14 @@ def group_index(self):
25532615

25542616
def _make_labels(self):
25552617
if self._labels is None or self._group_index is None:
2556-
labels, uniques = algorithms.factorize(
2557-
self.grouper, sort=self.sort)
2558-
uniques = Index(uniques, name=self.name)
2618+
# we have a list of groupers
2619+
if isinstance(self.grouper, BaseGrouper):
2620+
labels = self.grouper.label_info
2621+
uniques = self.grouper.result_index
2622+
else:
2623+
labels, uniques = algorithms.factorize(
2624+
self.grouper, sort=self.sort)
2625+
uniques = Index(uniques, name=self.name)
25592626
self._labels = labels
25602627
self._group_index = uniques
25612628

@@ -2566,7 +2633,7 @@ def groups(self):
25662633

25672634

25682635
def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
2569-
mutated=False):
2636+
mutated=False, validate=True):
25702637
"""
25712638
create and return a BaseGrouper, which is an internal
25722639
mapping of how to create the grouper indexers.
@@ -2583,6 +2650,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
25832650
are and then creates a Grouping for each one, combined into
25842651
a BaseGrouper.
25852652
2653+
If validate, then check for key/level overlaps
2654+
25862655
"""
25872656
group_axis = obj._get_axis(axis)
25882657

@@ -2707,7 +2776,7 @@ def is_in_obj(gpr):
27072776

27082777
elif is_in_axis(gpr): # df.groupby('name')
27092778
if gpr in obj:
2710-
if gpr in obj.index.names:
2779+
if validate and gpr in obj.index.names:
27112780
warnings.warn(
27122781
("'%s' is both a column name and an index level.\n"
27132782
"Defaulting to column but "

pandas/core/resample.py

+2-25
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def _get_binner(self):
250250
"""
251251

252252
binner, bins, binlabels = self._get_binner_for_time()
253-
bin_grouper = BinGrouper(bins, binlabels)
253+
bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer)
254254
return binner, bin_grouper
255255

256256
def _assure_grouper(self):
@@ -1105,35 +1105,12 @@ def _get_resampler(self, obj, kind=None):
11051105
"TimedeltaIndex or PeriodIndex, "
11061106
"but got an instance of %r" % type(ax).__name__)
11071107

1108-
def _get_grouper(self, obj):
1108+
def _get_grouper(self, obj, validate=True):
11091109
# create the resampler and return our binner
11101110
r = self._get_resampler(obj)
11111111
r._set_binner()
11121112
return r.binner, r.grouper, r.obj
11131113

1114-
def _get_binner_for_grouping(self, obj):
1115-
# return an ordering of the transformed group labels,
1116-
# suitable for multi-grouping, e.g the labels for
1117-
# the resampled intervals
1118-
binner, grouper, obj = self._get_grouper(obj)
1119-
1120-
l = []
1121-
for key, group in grouper.get_iterator(self.ax):
1122-
l.extend([key] * len(group))
1123-
1124-
if isinstance(self.ax, PeriodIndex):
1125-
grouper = binner.__class__(l, freq=binner.freq, name=binner.name)
1126-
else:
1127-
# resampling causes duplicated values, specifying freq is invalid
1128-
grouper = binner.__class__(l, name=binner.name)
1129-
1130-
# since we may have had to sort
1131-
# may need to reorder groups here
1132-
if self.indexer is not None:
1133-
indexer = self.indexer.argsort(kind='quicksort')
1134-
grouper = grouper.take(indexer)
1135-
return grouper
1136-
11371114
def _get_time_bins(self, ax):
11381115
if not isinstance(ax, DatetimeIndex):
11391116
raise TypeError('axis must be a DatetimeIndex, but got '

pandas/tests/groupby/test_timegrouper.py

+19
Original file line numberDiff line numberDiff line change
@@ -623,3 +623,22 @@ def test_nunique_with_timegrouper_and_nat(self):
623623
result = test.groupby(grouper)['data'].nunique()
624624
expected = test[test.time.notnull()].groupby(grouper)['data'].nunique()
625625
tm.assert_series_equal(result, expected)
626+
627+
def test_scalar_call_versus_list_call(self):
628+
# Issue: 17530
629+
data_frame = {
630+
'location': ['shanghai', 'beijing', 'shanghai'],
631+
'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15',
632+
'2017-08-11 22:23:15'],
633+
dtype='datetime64[ns]'),
634+
'value': [1, 2, 3]
635+
}
636+
data_frame = pd.DataFrame(data_frame).set_index('time')
637+
grouper = pd.Grouper(freq='D')
638+
639+
grouped = data_frame.groupby(grouper)
640+
result = grouped.count()
641+
grouped = data_frame.groupby([grouper])
642+
expected = grouped.count()
643+
644+
assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)