From 6838d9caf49e73f8bc9cc23535a0f8ad89a604cd Mon Sep 17 00:00:00 2001 From: inverse <534676033@qq.com> Date: Mon, 18 Sep 2017 19:03:35 +0800 Subject: [PATCH 01/10] use grouper if given the list if grouper --- pandas/core/groupby.py | 42 ++++++++++++++++++----------------- pandas/core/resample.py | 23 ------------------- pandas/tests/test_resample.py | 14 ++++++++++++ 3 files changed, 36 insertions(+), 43 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2f2056279558d..5d64701b34255 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -326,12 +326,6 @@ def _set_grouper(self, obj, sort=False): self.grouper = ax return self.grouper - def _get_binner_for_grouping(self, obj): - """ default to the standard binner here """ - group_axis = obj._get_axis(self.axis) - return Grouping(group_axis, None, obj=obj, name=self.key, - level=self.level, sort=self.sort, in_axis=False) - @property def groups(self): return self.grouper.groups @@ -2460,6 +2454,15 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper, self._labels, self._group_index = \ index._get_grouper_for_level(self.grouper, level) + # a passed Grouper like + elif isinstance(self.grouper, Grouper): + # get the new grouper + _, grouper, _ = self.grouper._get_grouper(self.obj) + if self.name is None: + self.name = grouper.result_index.name + self.obj = self.grouper.obj + self.grouper = grouper + else: if self.grouper is None and self.name is not None: self.grouper = self.obj[self.name] @@ -2482,15 +2485,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, categories=c, ordered=self.grouper.ordered)) - # a passed Grouper like - elif isinstance(self.grouper, Grouper): - - # get the new grouper - grouper = self.grouper._get_binner_for_grouping(self.obj) - self.obj = self.grouper.obj - self.grouper = grouper - if self.name is None: - self.name = grouper.name # we are done if isinstance(self.grouper, Grouping): @@ -2536,8 
+2530,11 @@ def ngroups(self): @cache_readonly def indices(self): - values = _ensure_categorical(self.grouper) - return values._reverse_indexer() + if isinstance(self.grouper, BaseGrouper): + return self.grouper.indices + else: + values = _ensure_categorical(self.grouper) + return values._reverse_indexer() @property def labels(self): @@ -2553,9 +2550,14 @@ def group_index(self): def _make_labels(self): if self._labels is None or self._group_index is None: - labels, uniques = algorithms.factorize( - self.grouper, sort=self.sort) - uniques = Index(uniques, name=self.name) + # for the situation of groupby list of groupers + if isinstance(self.grouper, BaseGrouper): + labels, _, _ = self.grouper.group_info + uniques = self.grouper.result_index + else: + labels, uniques = algorithms.factorize( + self.grouper, sort=self.sort) + uniques = Index(uniques, name=self.name) self._labels = labels self._group_index = uniques diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 083fbcaaabe46..0d6ace747a9e2 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1111,29 +1111,6 @@ def _get_grouper(self, obj): r._set_binner() return r.binner, r.grouper, r.obj - def _get_binner_for_grouping(self, obj): - # return an ordering of the transformed group labels, - # suitable for multi-grouping, e.g the labels for - # the resampled intervals - binner, grouper, obj = self._get_grouper(obj) - - l = [] - for key, group in grouper.get_iterator(self.ax): - l.extend([key] * len(group)) - - if isinstance(self.ax, PeriodIndex): - grouper = binner.__class__(l, freq=binner.freq, name=binner.name) - else: - # resampling causes duplicated values, specifying freq is invalid - grouper = binner.__class__(l, name=binner.name) - - # since we may have had to sort - # may need to reorder groups here - if self.indexer is not None: - indexer = self.indexer.argsort(kind='quicksort') - grouper = grouper.take(indexer) - return grouper - def _get_time_bins(self, ax): if not 
isinstance(ax, DatetimeIndex): raise TypeError('axis must be a DatetimeIndex, but got ' diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index cd15203eccd82..0ee760842b834 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3380,3 +3380,17 @@ def test_aggregate_with_nat(self): # if NaT is included, 'var', 'std', 'mean', 'first','last' # and 'nth' doesn't work yet + + def test_scalar_call_versus_list_call(self): + data_frame = pd.DataFrame({ + 'location': ['shanghai', 'beijing', 'shanghai'], + 'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15', '2017-08-11 22:23:15'], + dtype='datetime64[ns]'), + 'value': [1, 2, 3] + }).set_index('time') + grouper = TimeGrouper('D') + grouped = data_frame.groupby(grouper) + data1 = grouped.count() + grouped = data_frame.groupby([grouper]) + data2 = grouped.count() + assert_frame_equal(data1, data2) From 66febe4e54260f0875526fa193423360715e2421 Mon Sep 17 00:00:00 2001 From: inverse <534676033@qq.com> Date: Tue, 19 Sep 2017 11:01:09 +0800 Subject: [PATCH 02/10] BinGrouper holds the sorted binners, give the indexer to reorder axis if needed, eg. 
build grouping labels --- pandas/core/groupby.py | 9 +++++++-- pandas/core/resample.py | 11 +++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5d64701b34255..aef4beffd69f4 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1730,13 +1730,14 @@ class BaseGrouper(object): """ def __init__(self, axis, groupings, sort=True, group_keys=True, - mutated=False): + mutated=False, indexer=None): self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis self.groupings = groupings self.sort = sort self.group_keys = group_keys self.mutated = mutated + self.indexer = indexer @property def shape(self): @@ -2282,11 +2283,12 @@ def generate_bins_generic(values, binner, closed): class BinGrouper(BaseGrouper): - def __init__(self, bins, binlabels, filter_empty=False, mutated=False): + def __init__(self, bins, binlabels, filter_empty=False, mutated=False, indexer=None): self.bins = _ensure_int64(bins) self.binlabels = _ensure_index(binlabels) self._filter_empty_groups = filter_empty self.mutated = mutated + self.indexer = indexer @cache_readonly def groups(self): @@ -2554,6 +2556,9 @@ def _make_labels(self): if isinstance(self.grouper, BaseGrouper): labels, _, _ = self.grouper.group_info uniques = self.grouper.result_index + if self.grouper.indexer is not None: + sorter = np.lexsort((labels, self.grouper.indexer)) + labels = labels[sorter] else: labels, uniques = algorithms.factorize( self.grouper, sort=self.sort) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 0d6ace747a9e2..316830e75a88b 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -250,7 +250,7 @@ def _get_binner(self): """ binner, bins, binlabels = self._get_binner_for_time() - bin_grouper = BinGrouper(bins, binlabels) + bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer) return binner, bin_grouper def _assure_grouper(self): @@ -879,7 +879,14 @@ 
def _downsample(self, how, **kwargs): if is_subperiod(ax.freq, self.freq): # Downsampling - return self._groupby_and_aggregate(how, grouper=self.grouper) + if len(new_index) == 0: + bins = [] + else: + i8 = memb.asi8 + rng = np.arange(i8[0], i8[-1] + 1) + bins = memb.searchsorted(rng, side='right') + grouper = BinGrouper(bins, new_index, indexer=self.groupby.indexer) + return self._groupby_and_aggregate(how, grouper=grouper) elif is_superperiod(ax.freq, self.freq): if how == 'ohlc': # GH #13083 From 0ff386f4e4af02f411c332e93cdd87ba40131eac Mon Sep 17 00:00:00 2001 From: inverse <534676033@qq.com> Date: Tue, 19 Sep 2017 11:19:37 +0800 Subject: [PATCH 03/10] PEP8 fix --- pandas/core/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index aef4beffd69f4..7710705b0a835 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2283,7 +2283,8 @@ def generate_bins_generic(values, binner, closed): class BinGrouper(BaseGrouper): - def __init__(self, bins, binlabels, filter_empty=False, mutated=False, indexer=None): + def __init__(self, bins, binlabels, filter_empty=False, mutated=False, + indexer=None): self.bins = _ensure_int64(bins) self.binlabels = _ensure_index(binlabels) self._filter_empty_groups = filter_empty @@ -2487,7 +2488,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, categories=c, ordered=self.grouper.ordered)) - # we are done if isinstance(self.grouper, Grouping): self.grouper = self.grouper.grouper From 00a18e3366ae577ff0523867dbcaa0c362cd6e8e Mon Sep 17 00:00:00 2001 From: inverse <534676033@qq.com> Date: Wed, 20 Sep 2017 11:00:12 +0800 Subject: [PATCH 04/10] PEP8 & comment --- pandas/core/groupby.py | 45 ++++++++++++++++++++++++++++++++++- pandas/tests/test_resample.py | 15 +++++++----- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7710705b0a835..f9b3a8008dece 100644 --- 
a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1727,6 +1727,18 @@ class BaseGrouper(object): """ This is an internal Grouper class, which actually holds the generated groups + + Parameters + ---------- + axis : the axis to group + groupings : all the grouping instances to handle in this grouper + for example for grouper list to groupby, need to pass the list + sort : True/False + whether this grouper will give sorted result or not + indexer: the indexer created by Grouper + some grouper (TimeGrouper eg) will sort its axis and its + group_info is also sorted, so need the indexer to reorder + """ def __init__(self, axis, groupings, sort=True, group_keys=True, @@ -2283,6 +2295,35 @@ def generate_bins_generic(values, binner, closed): class BinGrouper(BaseGrouper): + """ + This is an internal Grouper class, which actually holds + the generated groups. In contrast with BaseGrouper, + BinGrouper get the sorted bins and binlabels to compute group_info + + Parameters + ---------- + bins : the split index of binlabels to group the item of axis + binlabels : the label list + indexer: the indexer created by Grouper + some grouper (TimeGrouper eg) will sort its axis and the + group_info of BinGrouper is also sorted + can use the indexer to reorder as the unsorted axis + + Examples + -------- + bins is [2, 4, 6, 8, 10] + binlabels is DatetimeIndex(['2005-01-01', '2005-01-03', + '2005-01-05', '2005-01-07', '2005-01-09'], + dtype='datetime64[ns]', freq='2D') + + then the group_info is + (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5) + + means the label of each item in axis, the index of label in label + list, group number + + """ + def __init__(self, bins, binlabels, filter_empty=False, mutated=False, indexer=None): self.bins = _ensure_int64(bins) @@ -2457,7 +2498,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper, self._labels, self._group_index = \ index._get_grouper_for_level(self.grouper, level) - # a 
passed Grouper like + # a passed Grouper like, directly get the grouper in the same way + # as single grouper groupby, use the group_info to get labels elif isinstance(self.grouper, Grouper): # get the new grouper _, grouper, _ = self.grouper._get_grouper(self.obj) @@ -2532,6 +2574,7 @@ def ngroups(self): @cache_readonly def indices(self): + # for the situation of groupby list of groupers if isinstance(self.grouper, BaseGrouper): return self.grouper.indices else: diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 0ee760842b834..e924e2105e794 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3381,13 +3381,16 @@ def test_aggregate_with_nat(self): # if NaT is included, 'var', 'std', 'mean', 'first','last' # and 'nth' doesn't work yet + # Issue: 17530 def test_scalar_call_versus_list_call(self): - data_frame = pd.DataFrame({ - 'location': ['shanghai', 'beijing', 'shanghai'], - 'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15', '2017-08-11 22:23:15'], - dtype='datetime64[ns]'), - 'value': [1, 2, 3] - }).set_index('time') + data_frame = { + 'location': ['shanghai', 'beijing', 'shanghai'], + 'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15', + '2017-08-11 22:23:15'], + dtype='datetime64[ns]'), + 'value': [1, 2, 3] + } + data_frame = pd.DataFrame(data_frame).set_index('time') grouper = TimeGrouper('D') grouped = data_frame.groupby(grouper) data1 = grouped.count() From cc1a4ac5d505cfa66d45c322367f24f4fbad9eaf Mon Sep 17 00:00:00 2001 From: inverse <534676033@qq.com> Date: Fri, 22 Sep 2017 17:52:30 +0800 Subject: [PATCH 05/10] comment --- pandas/core/groupby.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f9b3a8008dece..14dead59baa5c 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1733,9 +1733,11 @@ class BaseGrouper(object): axis : the axis to group groupings : all the 
grouping instances to handle in this grouper for example for grouper list to groupby, need to pass the list - sort : True/False + sort : boolean, default True whether this grouper will give sorted result or not - indexer: the indexer created by Grouper + group_keys : boolean, default True + mutated : boolean, default False + indexer : the indexer created by Grouper some grouper (TimeGrouper eg) will sort its axis and its group_info is also sorted, so need the indexer to reorder @@ -2296,18 +2298,15 @@ def generate_bins_generic(values, binner, closed): class BinGrouper(BaseGrouper): """ - This is an internal Grouper class, which actually holds - the generated groups. In contrast with BaseGrouper, - BinGrouper get the sorted bins and binlabels to compute group_info + This is an internal Grouper class Parameters ---------- bins : the split index of binlabels to group the item of axis binlabels : the label list - indexer: the indexer created by Grouper - some grouper (TimeGrouper eg) will sort its axis and the - group_info of BinGrouper is also sorted - can use the indexer to reorder as the unsorted axis + filter_empty : boolean, default False + mutated : boolean, default False + indexer : a intp array Examples -------- From 6927207f9a444c25e8569f3b5d04a7b920f5af5a Mon Sep 17 00:00:00 2001 From: inverse <534676033@qq.com> Date: Mon, 25 Sep 2017 10:56:53 +0800 Subject: [PATCH 06/10] new method for BaseGrouper and inherited class --- pandas/core/groupby.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 14dead59baa5c..461f7df402bfa 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1897,6 +1897,15 @@ def group_info(self): comp_ids = _ensure_int64(comp_ids) return comp_ids, obs_group_ids, ngroups + # 17530 + @cache_readonly + def label_info(self): + labels, _, _ = self.group_info + if self.indexer is not None: + sorter = np.lexsort((labels, self.indexer)) + labels = 
labels[sorter] + return labels + def _get_compressed_labels(self): all_labels = [ping.labels for ping in self.groupings] if len(all_labels) > 1: @@ -2596,11 +2605,8 @@ def _make_labels(self): if self._labels is None or self._group_index is None: # for the situation of groupby list of groupers if isinstance(self.grouper, BaseGrouper): - labels, _, _ = self.grouper.group_info + labels = self.grouper.label_info uniques = self.grouper.result_index - if self.grouper.indexer is not None: - sorter = np.lexsort((labels, self.grouper.indexer)) - labels = labels[sorter] else: labels, uniques = algorithms.factorize( self.grouper, sort=self.sort) From 3ce56bda2464fc21f850f4f382eabadb35d57c71 Mon Sep 17 00:00:00 2001 From: inverse <534676033@qq.com> Date: Tue, 26 Sep 2017 16:03:24 +0800 Subject: [PATCH 07/10] comment --- pandas/core/groupby.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 461f7df402bfa..eb6da61cc756f 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1730,8 +1730,10 @@ class BaseGrouper(object): Parameters ---------- - axis : the axis to group - groupings : all the grouping instances to handle in this grouper + axis : int + the axis to group + groupings : array of grouping + all the grouping instances to handle in this grouper for example for grouper list to groupby, need to pass the list sort : boolean, default True whether this grouper will give sorted result or not @@ -1897,9 +1899,9 @@ def group_info(self): comp_ids = _ensure_int64(comp_ids) return comp_ids, obs_group_ids, ngroups - # 17530 @cache_readonly def label_info(self): + # return the labels of items in original grouped axis labels, _, _ = self.group_info if self.indexer is not None: sorter = np.lexsort((labels, self.indexer)) @@ -2319,16 +2321,19 @@ class BinGrouper(BaseGrouper): Examples -------- - bins is [2, 4, 6, 8, 10] - binlabels is DatetimeIndex(['2005-01-01', '2005-01-03', + 
bins: [2, 4, 6, 8, 10] + binlabels: DatetimeIndex(['2005-01-01', '2005-01-03', '2005-01-05', '2005-01-07', '2005-01-09'], dtype='datetime64[ns]', freq='2D') - then the group_info is + the group_info, which contains the label of each item in grouped + axis, the index of label in label list, group number, is + (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5) - means the label of each item in axis, the index of label in label - list, group number + means that, the grouped axis has 10 items, can be grouped into 5 + labels, the first and second items belong to the first label, the + third and fourth items belong to the second label, and so on """ From 737c00ee783cfd1c5169a149ca2b56444f4b50e0 Mon Sep 17 00:00:00 2001 From: inverse <534676033@qq.com> Date: Thu, 28 Sep 2017 19:48:31 +0800 Subject: [PATCH 08/10] move test to test time grouper --- pandas/tests/groupby/test_timegrouper.py | 17 +++++++++++++++++ pandas/tests/test_resample.py | 17 ----------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index fafcbf947e3df..938baadaf80ab 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -623,3 +623,20 @@ def test_nunique_with_timegrouper_and_nat(self): result = test.groupby(grouper)['data'].nunique() expected = test[test.time.notnull()].groupby(grouper)['data'].nunique() tm.assert_series_equal(result, expected) + + def test_scalar_call_versus_list_call(self): + # Issue: 17530 + data_frame = { + 'location': ['shanghai', 'beijing', 'shanghai'], + 'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15', + '2017-08-11 22:23:15'], + dtype='datetime64[ns]'), + 'value': [1, 2, 3] + } + data_frame = pd.DataFrame(data_frame).set_index('time') + grouper = pd.TimeGrouper('D') + grouped = data_frame.groupby(grouper) + data1 = grouped.count() + grouped = data_frame.groupby([grouper]) + data2 = 
grouped.count() + assert_frame_equal(data1, data2) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index e924e2105e794..cd15203eccd82 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3380,20 +3380,3 @@ def test_aggregate_with_nat(self): # if NaT is included, 'var', 'std', 'mean', 'first','last' # and 'nth' doesn't work yet - - # Issue: 17530 - def test_scalar_call_versus_list_call(self): - data_frame = { - 'location': ['shanghai', 'beijing', 'shanghai'], - 'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15', - '2017-08-11 22:23:15'], - dtype='datetime64[ns]'), - 'value': [1, 2, 3] - } - data_frame = pd.DataFrame(data_frame).set_index('time') - grouper = TimeGrouper('D') - grouped = data_frame.groupby(grouper) - data1 = grouped.count() - grouped = data_frame.groupby([grouper]) - data2 = grouped.count() - assert_frame_equal(data1, data2) From b868cbc14f7966c7aea9b34dd7acb9cfef93b604 Mon Sep 17 00:00:00 2001 From: inverse <534676033@qq.com> Date: Fri, 29 Sep 2017 18:29:20 +0800 Subject: [PATCH 09/10] update whatsnew entry --- doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 1094e96bd0d20..3276310fa3e6e 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -704,6 +704,7 @@ Groupby/Resample/Rolling - Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`) - Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`) - Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`) +- Bug in ``TimeGrouper`` where the result differs when passed as a list and as a scalar (:issue:`17530`) Sparse ^^^^^^ From 950de2031bad7a88c29bac9e86a73e1e7b2a495d Mon Sep 17 00:00:00 
2001 From: Jeff Reback Date: Sun, 1 Oct 2017 11:04:07 -0400 Subject: [PATCH 10/10] fixups --- pandas/core/groupby.py | 35 +++++++++++++++--------- pandas/core/resample.py | 11 ++------ pandas/tests/groupby/test_timegrouper.py | 10 ++++--- 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index eb6da61cc756f..9379ade4be7a6 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -256,11 +256,13 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): def ax(self): return self.grouper - def _get_grouper(self, obj): + def _get_grouper(self, obj, validate=True): """ Parameters ---------- obj : the subject object + validate : boolean, default True + if True, validate the grouper Returns ------- @@ -271,7 +273,8 @@ def _get_grouper(self, obj): self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key], axis=self.axis, level=self.level, - sort=self.sort) + sort=self.sort, + validate=validate) return self.binner, self.grouper, self.obj def _set_grouper(self, obj, sort=False): @@ -1739,8 +1742,9 @@ class BaseGrouper(object): whether this grouper will give sorted result or not group_keys : boolean, default True mutated : boolean, default False - indexer : the indexer created by Grouper - some grouper (TimeGrouper eg) will sort its axis and its + indexer : intp array, optional + the indexer created by Grouper + some groupers (TimeGrouper) will sort its axis and its group_info is also sorted, so need the indexer to reorder """ @@ -2514,8 +2518,11 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get labels elif isinstance(self.grouper, Grouper): - # get the new grouper - _, grouper, _ = self.grouper._get_grouper(self.obj) + # get the new grouper; we already have disambiguated + # what key/level refer to exactly, don't need to + # 
check again as we have by this point converted these + # to an actual value (rather than a pd.Grouper) + _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False) if self.name is None: self.name = grouper.result_index.name self.obj = self.grouper.obj @@ -2587,12 +2594,12 @@ def ngroups(self): @cache_readonly def indices(self): - # for the situation of groupby list of groupers + # we have a list of groupers if isinstance(self.grouper, BaseGrouper): return self.grouper.indices - else: - values = _ensure_categorical(self.grouper) - return values._reverse_indexer() + + values = _ensure_categorical(self.grouper) + return values._reverse_indexer() @property def labels(self): @@ -2608,7 +2615,7 @@ def group_index(self): def _make_labels(self): if self._labels is None or self._group_index is None: - # for the situation of groupby list of groupers + # we have a list of groupers if isinstance(self.grouper, BaseGrouper): labels = self.grouper.label_info uniques = self.grouper.result_index @@ -2626,7 +2633,7 @@ def groups(self): def _get_grouper(obj, key=None, axis=0, level=None, sort=True, - mutated=False): + mutated=False, validate=True): """ create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. @@ -2643,6 +2650,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, are and then creates a Grouping for each one, combined into a BaseGrouper. 
+ If validate, then check for key/level overlaps + """ group_axis = obj._get_axis(axis) @@ -2767,7 +2776,7 @@ def is_in_obj(gpr): elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: - if gpr in obj.index.names: + if validate and gpr in obj.index.names: warnings.warn( ("'%s' is both a column name and an index level.\n" "Defaulting to column but " diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 316830e75a88b..6edbb99641542 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -879,14 +879,7 @@ def _downsample(self, how, **kwargs): if is_subperiod(ax.freq, self.freq): # Downsampling - if len(new_index) == 0: - bins = [] - else: - i8 = memb.asi8 - rng = np.arange(i8[0], i8[-1] + 1) - bins = memb.searchsorted(rng, side='right') - grouper = BinGrouper(bins, new_index, indexer=self.groupby.indexer) - return self._groupby_and_aggregate(how, grouper=grouper) + return self._groupby_and_aggregate(how, grouper=self.grouper) elif is_superperiod(ax.freq, self.freq): if how == 'ohlc': # GH #13083 @@ -1112,7 +1105,7 @@ def _get_resampler(self, obj, kind=None): "TimedeltaIndex or PeriodIndex, " "but got an instance of %r" % type(ax).__name__) - def _get_grouper(self, obj): + def _get_grouper(self, obj, validate=True): # create the resampler and return our binner r = self._get_resampler(obj) r._set_binner() diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 938baadaf80ab..c8503b16a0e16 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -634,9 +634,11 @@ def test_scalar_call_versus_list_call(self): 'value': [1, 2, 3] } data_frame = pd.DataFrame(data_frame).set_index('time') - grouper = pd.TimeGrouper('D') + grouper = pd.Grouper(freq='D') + grouped = data_frame.groupby(grouper) - data1 = grouped.count() + result = grouped.count() grouped = data_frame.groupby([grouper]) - data2 = grouped.count() - assert_frame_equal(data1, data2) + expected 
= grouped.count() + + assert_frame_equal(result, expected)