From fa532b69fb41c82fa430242d6f6a2eb27a52f424 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 1 Apr 2018 10:28:46 -0400 Subject: [PATCH 1/6] BUG: groupby with categorical and other columns closes #14942 --- doc/source/whatsnew/v0.23.0.txt | 35 ++ pandas/core/generic.py | 11 +- pandas/core/groupby/groupby.py | 73 ++- pandas/core/reshape/pivot.py | 25 +- pandas/tests/groupby/test_categorical.py | 693 +++++++++++++---------- pandas/tests/reshape/test_pivot.py | 60 +- 6 files changed, 549 insertions(+), 348 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 604b68b650201..8e77a7b18cdb0 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -527,6 +527,41 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use 'Taxes': -200, 'Net result': 300}).sort_index() +.. _whatsnew_0230.api_breaking.categorical_grouping: + +Categorical Groupers will now require passing the observed keyword +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions, grouping by 1 or more categorical columns would result in an index that was the cartesian product of all of the categories for +each grouper, not just the observed values. ``.groupby()`` has gained the ``observed`` keyword to toggle this behavior. The default remains backward +compatible (generate a cartesian product). Pandas will show a ``FutureWarning`` if the ``observed`` keyword is not passed; the default will +change to ``observed=True`` in the future. (:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`) + + +.. 
ipython:: python + + cat1 = pd.Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = pd.Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + df['C'] = ['foo', 'bar'] * 2 + df + +Previous Behavior (show all values): + +.. ipython:: python + +.. code-block:: python + df.groupby(['A', 'B', 'C'], observed=False).count() + + +New Behavior (show only observed values): + +.. ipython:: python + + df.groupby(['A', 'B', 'C'], observed=True).count() + .. _whatsnew_0230.api_breaking.deprecate_panel: Deprecate Panel diff --git a/pandas/core/generic.py b/pandas/core/generic.py index af19acbb416ee..9f68d26208619 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6599,7 +6599,7 @@ def clip_lower(self, threshold, axis=None, inplace=False): axis=axis, inplace=inplace) def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, - group_keys=True, squeeze=False, **kwargs): + group_keys=True, squeeze=False, observed=None, **kwargs): """ Group series using mapper (dict or key function, apply given function to group, return result as series) or by a series of columns. @@ -6632,6 +6632,13 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, squeeze : boolean, default False reduce the dimensionality of the return type if possible, otherwise return a consistent type + observed : boolean, default None + if True: only show observed values for categorical groupers + if False: show all values for categorical groupers + if None: if any categorical groupers, show a FutureWarning, + default to False + + .. 
versionadded:: 0.23.0 Returns ------- @@ -6665,7 +6672,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, axis = self._get_axis_number(axis) return groupby(self, by=by, axis=axis, level=level, as_index=as_index, sort=sort, group_keys=group_keys, squeeze=squeeze, - **kwargs) + observed=observed, **kwargs) def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8c20d62117e25..b6a759efb6cf3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -556,7 +556,8 @@ class _GroupBy(PandasObject, SelectionMixin): def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, - sort=True, group_keys=True, squeeze=False, **kwargs): + sort=True, group_keys=True, squeeze=False, + observed=None, **kwargs): self._selection = selection @@ -576,6 +577,7 @@ def __init__(self, obj, keys=None, axis=0, level=None, self.sort = sort self.group_keys = group_keys self.squeeze = squeeze + self.observed = observed self.mutated = kwargs.pop('mutated', False) if grouper is None: @@ -583,6 +585,7 @@ def __init__(self, obj, keys=None, axis=0, level=None, axis=axis, level=level, sort=sort, + observed=observed, mutated=self.mutated) self.obj = obj @@ -2331,18 +2334,21 @@ def ngroups(self): def recons_labels(self): comp_ids, obs_ids, _ = self.group_info labels = (ping.labels for ping in self.groupings) - return decons_obs_group_ids(comp_ids, - obs_ids, self.shape, labels, xnull=True) + return decons_obs_group_ids( + comp_ids, obs_ids, self.shape, labels, xnull=True) @cache_readonly def result_index(self): if not self.compressed and len(self.groupings) == 1: return self.groupings[0].group_index.rename(self.names[0]) - return MultiIndex(levels=[ping.group_index for ping in self.groupings], - labels=self.recons_labels, - verify_integrity=False, - names=self.names) + labels = 
self.recons_labels + levels = [ping.group_index for ping in self.groupings] + result = MultiIndex(levels=levels, + labels=labels, + verify_integrity=False, + names=self.names) + return result def get_group_levels(self): if not self.compressed and len(self.groupings) == 1: @@ -2883,6 +2889,7 @@ class Grouping(object): obj : name : level : + observed : If we are a Categorical, use the observed values in_axis : if the Grouping is a column in self.obj and hence among Groupby.exclusions list @@ -2898,7 +2905,7 @@ class Grouping(object): """ def __init__(self, index, grouper=None, obj=None, name=None, level=None, - sort=True, in_axis=False): + sort=True, observed=None, in_axis=False): self.name = name self.level = level @@ -2906,6 +2913,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.index = index self.sort = sort self.obj = obj + self.observed = observed self.in_axis = in_axis # right place for this? @@ -2954,16 +2962,34 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, elif is_categorical_dtype(self.grouper): self.grouper = self.grouper._codes_for_groupby(self.sort) + codes = self.grouper.codes + categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes - self._labels = self.grouper.codes + self._labels = codes + + # Use the observed values of the grouper if indicated + observed = self.observed + if observed is None: + msg = ("pass observed=True to ensure that a " + "categorical grouper only returns the " + "observed groupers, or\n" + "observed=False to return NA for non-observed" + "values\n") + warnings.warn(msg, FutureWarning, stacklevel=5) + observed = False + + if observed: + codes = algorithms.unique1d(codes) + else: + codes = np.arange(len(categories)) - c = self.grouper.categories self._group_index = CategoricalIndex( - Categorical.from_codes(np.arange(len(c)), - categories=c, - ordered=self.grouper.ordered)) + 
Categorical.from_codes( + codes=codes, + categories=categories, + ordered=self.grouper.ordered)) # we are done if isinstance(self.grouper, Grouping): @@ -3048,7 +3074,7 @@ def groups(self): def _get_grouper(obj, key=None, axis=0, level=None, sort=True, - mutated=False, validate=True): + observed=None, mutated=False, validate=True): """ create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. @@ -3065,6 +3091,9 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, are and then creates a Grouping for each one, combined into a BaseGrouper. + If observed & we have a categorical grouper, only show the observed + values + If validate, then check for key/level overlaps """ @@ -3243,6 +3272,7 @@ def is_in_obj(gpr): name=name, level=level, sort=sort, + observed=observed, in_axis=in_axis) \ if not isinstance(gpr, Grouping) else gpr @@ -4154,7 +4184,7 @@ def first_not_none(values): not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: if len(self.grouper.groupings) > 1: - key_index = MultiIndex.from_tuples(keys, names=key_names) + key_index = self.grouper.result_index else: ping = self.grouper.groupings[0] @@ -4244,8 +4274,9 @@ def first_not_none(values): # normally use vstack as its faster than concat # and if we have mi-columns - if isinstance(v.index, - MultiIndex) or key_index is None: + if (isinstance(v.index, MultiIndex) or + key_index is None or + isinstance(key_index, MultiIndex)): stacked_values = np.vstack(map(np.asarray, values)) result = DataFrame(stacked_values, index=key_index, columns=index) @@ -4696,6 +4727,14 @@ def _reindex_output(self, result): This can re-expand the output space """ + + # TODO(jreback): remove completely + # when observed parameter is defaulted to True + # gh-20583 + + if self.observed: + return result + groupings = self.grouper.groupings if groupings is None: return result diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 
74a9b59d3194a..b9071d97f78a8 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -79,7 +79,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - grouped = data.groupby(keys) + grouped = data.groupby(keys, observed=dropna) agged = grouped.agg(aggfunc) table = agged @@ -241,10 +241,13 @@ def _all_key(key): return (key, margins_name) + ('',) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows + values].groupby(rows).agg(aggfunc) + margin = data[rows + values].groupby( + rows, observed=True).agg(aggfunc) cat_axis = 1 - for key, piece in table.groupby(level=0, axis=cat_axis): + for key, piece in table.groupby(level=0, + axis=cat_axis, + observed=True): all_key = _all_key(key) # we are going to mutate this, so need to copy! @@ -264,7 +267,9 @@ def _all_key(key): else: margin = grand_margin cat_axis = 0 - for key, piece in table.groupby(level=0, axis=cat_axis): + for key, piece in table.groupby(level=0, + axis=cat_axis, + observed=True): all_key = _all_key(key) table_pieces.append(piece) table_pieces.append(Series(margin[key], index=[all_key])) @@ -279,7 +284,8 @@ def _all_key(key): margin_keys = table.columns if len(cols) > 0: - row_margin = data[cols + values].groupby(cols).agg(aggfunc) + row_margin = data[cols + values].groupby( + cols, observed=True).agg(aggfunc) row_margin = row_margin.stack() # slight hack @@ -304,14 +310,17 @@ def _all_key(): return (margins_name, ) + ('', ) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows].groupby(rows).apply(aggfunc) + margin = data[rows].groupby(rows, + observed=True).apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table margin_keys.append(all_key) else: - margin = data.groupby(level=0, axis=0).apply(aggfunc) + margin = data.groupby(level=0, + axis=0, + observed=True).apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table @@ -322,7 +331,7 @@ def _all_key(): margin_keys = table.columns if 
len(cols): - row_margin = data[cols].groupby(cols).apply(aggfunc) + row_margin = data[cols].groupby(cols, observed=True).apply(aggfunc) else: row_margin = Series(np.nan, index=result.columns) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 160b60e69f39d..86c4f9c32c5f9 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -5,16 +5,48 @@ import pytest import numpy as np -from numpy import nan - import pandas as pd from pandas import (Index, MultiIndex, CategoricalIndex, - DataFrame, Categorical, Series, Interval, qcut) + DataFrame, Categorical, Series, qcut) from pandas.util.testing import assert_frame_equal, assert_series_equal import pandas.util.testing as tm -def test_groupby(): +@pytest.fixture(params=[True, False]) +def observed(request): + return request.param + + +def cartesian_product_for_groupers(result, args, names): + """ Reindex to a cartesian product for the groupers, + preserving the nature (Categorical) of each grouper """ + + def f(a): + if isinstance(a, (CategoricalIndex, Categorical)): + categories = a.categories + a = Categorical.from_codes(np.arange(len(categories)), + categories=categories, + ordered=a.ordered) + return a + + index = pd.MultiIndex.from_product(map(f, args), names=names) + return result.reindex(index).sort_index() + + +def test_apply_use_categorical_name(df): + cats = qcut(df.C, 4) + + def get_stats(group): + return {'min': group.min(), + 'max': group.max(), + 'count': group.count(), + 'mean': group.mean()} + + result = df.groupby(cats, observed=False).D.apply(get_stats) + assert result.index.names[0] == 'C' + + +def test_basic(): cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], ordered=True) @@ -22,56 +54,29 @@ def test_groupby(): exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) - result = 
data.groupby("b").mean() + result = data.groupby("b", observed=False).mean() tm.assert_frame_equal(result, expected) - raw_cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - raw_cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) - df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) + cat1 = Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) # single grouper - gb = df.groupby("A") + gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) - # multiple groupers - gb = df.groupby(['A', 'B']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True)], - names=['A', 'B']) - expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, - np.nan, np.nan, np.nan]}, - index=exp_index) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - # multiple groupers with a non-cat - df = df.copy() - df['C'] = ['foo', 'bar'] * 2 - gb = df.groupby(['A', 'B', 'C']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True), - ['foo', 'bar']], - names=['A', 'B', 'C']) - expected = DataFrame({'values': Series( - np.nan, index=exp_index)}).sort_index() - expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] - result = gb.sum() - tm.assert_frame_equal(result, expected) - # GH 8623 x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. 
Doe']], columns=['person_id', 'person_name']) x['person_name'] = Categorical(x.person_name) - g = x.groupby(['person_id']) + g = x.groupby(['person_id'], observed=False) result = g.transform(lambda x: x) tm.assert_frame_equal(result, x[['person_name']]) @@ -93,36 +98,48 @@ def f(x): df = DataFrame({"a": [5, 15, 25]}) c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) - result = df.a.groupby(c).transform(sum) + result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df['a']) tm.assert_series_equal( - df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) - tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) + df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), + df['a']) + tm.assert_frame_equal( + df.groupby(c, observed=False).transform(sum), + df[['a']]) tm.assert_frame_equal( - df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']]) + df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), + df[['a']]) # Filter - tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a']) - tm.assert_frame_equal(df.groupby(c).filter(np.all), df) + tm.assert_series_equal( + df.a.groupby(c, observed=False).filter(np.all), + df['a']) + tm.assert_frame_equal( + df.groupby(c, observed=False).filter(np.all), + df) # Non-monotonic df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) - result = df.a.groupby(c).transform(sum) + result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df['a']) tm.assert_series_equal( - df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) - tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) + df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), + df['a']) + tm.assert_frame_equal( + df.groupby(c, observed=False).transform(sum), + df[['a']]) tm.assert_frame_equal( - df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']]) + df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), + df[['a']]) 
# GH 9603 df = DataFrame({'a': [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) - result = df.groupby(c).apply(len) + result = df.groupby(c, observed=False).apply(len) exp_index = CategoricalIndex( c.values.categories, ordered=c.values.ordered) @@ -130,36 +147,56 @@ def f(x): expected.index.name = 'a' tm.assert_series_equal(result, expected) + # more basic + levels = ['foo', 'bar', 'baz', 'qux'] + codes = np.random.randint(0, 4, size=100) -def test_groupby_sort(): + cats = Categorical.from_codes(codes, levels, ordered=True) - # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby - # This should result in a properly sorted Series so that the plot - # has a sorted x axis - # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') + data = DataFrame(np.random.randn(100, 4)) - df = DataFrame({'value': np.random.randint(0, 10000, 100)}) - labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] - cat_labels = Categorical(labels, labels) + result = data.groupby(cats, observed=False).mean() - df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), - right=False, labels=cat_labels) + expected = data.groupby(np.asarray(cats), observed=False).mean() + exp_idx = CategoricalIndex(levels, categories=cats.categories, + ordered=True) + expected = expected.reindex(exp_idx) - res = df.groupby(['value_group'])['value_group'].count() - exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] - exp.index = CategoricalIndex(exp.index, name=exp.index.name) - tm.assert_series_equal(res, exp) + assert_frame_equal(result, expected) + + grouped = data.groupby(cats, observed=False) + desc_result = grouped.describe() + + idx = cats.codes.argsort() + ord_labels = np.asarray(cats).take(idx) + ord_data = data.take(idx) + + exp_cats = Categorical(ord_labels, ordered=True, + categories=['foo', 'bar', 'baz', 'qux']) + expected = 
ord_data.groupby( + exp_cats, sort=False, observed=False).describe() + assert_frame_equal(desc_result, expected) + + # GH 10460 + expc = Categorical.from_codes(np.arange(4).repeat(8), + levels, ordered=True) + exp = CategoricalIndex(expc) + tm.assert_index_equal((desc_result.stack().index + .get_level_values(0)), exp) + exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', + '75%', 'max'] * 4) + tm.assert_index_equal((desc_result.stack().index + .get_level_values(1)), exp) -def test_level_groupby_get_group(): +def test_level_get_group(observed): # GH15155 df = DataFrame(data=np.arange(2, 22, 2), index=MultiIndex( levels=[pd.CategoricalIndex(["a", "b"]), range(10)], labels=[[0] * 5 + [1] * 5, range(10)], names=["Index1", "Index2"])) - g = df.groupby(level=["Index1"]) + g = df.groupby(level=["Index1"], observed=observed) # expected should equal test.loc[["a"]] # GH15166 @@ -173,94 +210,215 @@ def test_level_groupby_get_group(): assert_frame_equal(result, expected) -def test_apply_use_categorical_name(df): - cats = qcut(df.C, 4) +@pytest.mark.parametrize('ordered', [True, False]) +def test_apply(ordered): + # GH 10138 - def get_stats(group): - return {'min': group.min(), - 'max': group.max(), - 'count': group.count(), - 'mean': group.mean()} + dense = Categorical(list('abc'), ordered=ordered) + + # 'b' is in the categories but not in the list + missing = Categorical( + list('aaa'), categories=['a', 'b'], ordered=ordered) + values = np.arange(len(dense)) + df = DataFrame({'missing': missing, + 'dense': dense, + 'values': values}) + grouped = df.groupby(['missing', 'dense'], observed=True) + + # missing category 'b' should still exist in the output index + idx = MultiIndex.from_arrays( + [missing, dense], names=['missing', 'dense']) + expected = DataFrame([0, 1, 2.], + index=idx, + columns=['values']) + + result = grouped.apply(lambda x: np.mean(x)) + assert_frame_equal(result, expected) - result = df.groupby(cats).D.apply(get_stats) - assert result.index.names[0] 
== 'C' + # we coerce back to ints + expected = expected.astype('int') + result = grouped.mean() + assert_frame_equal(result, expected) + result = grouped.agg(np.mean) + assert_frame_equal(result, expected) -def test_apply_categorical_data(): - # GH 10138 - for ordered in [True, False]: - dense = Categorical(list('abc'), ordered=ordered) - # 'b' is in the categories but not in the list - missing = Categorical( - list('aaa'), categories=['a', 'b'], ordered=ordered) - values = np.arange(len(dense)) - df = DataFrame({'missing': missing, - 'dense': dense, - 'values': values}) - grouped = df.groupby(['missing', 'dense']) - - # missing category 'b' should still exist in the output index - idx = MultiIndex.from_product( - [Categorical(['a', 'b'], ordered=ordered), - Categorical(['a', 'b', 'c'], ordered=ordered)], - names=['missing', 'dense']) - expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], - index=idx, - columns=['values']) - - assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) - assert_frame_equal(grouped.mean(), expected) - assert_frame_equal(grouped.agg(np.mean), expected) - - # but for transform we should still get back the original index - idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], - names=['missing', 'dense']) - expected = Series(1, index=idx) - assert_series_equal(grouped.apply(lambda x: 1), expected) - - -def test_groupby_categorical(): - levels = ['foo', 'bar', 'baz', 'qux'] - codes = np.random.randint(0, 4, size=100) + # but for transform we should still get back the original index + idx = MultiIndex.from_arrays([missing, dense], + names=['missing', 'dense']) + expected = Series(1, index=idx) + result = grouped.apply(lambda x: 1) + assert_series_equal(result, expected) - cats = Categorical.from_codes(codes, levels, ordered=True) - data = DataFrame(np.random.randn(100, 4)) +def test_observed_warning(): + # 20583 - future warning on observe - result = data.groupby(cats).mean() + cat1 = Categorical(["a", "a", "b", "b"], + 
categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + df['C'] = ['foo', 'bar'] * 2 - expected = data.groupby(np.asarray(cats)).mean() - exp_idx = CategoricalIndex(levels, categories=cats.categories, - ordered=True) - expected = expected.reindex(exp_idx) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df.groupby(['A', 'B', 'C']) - assert_frame_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df.groupby('A') - grouped = data.groupby(cats) - desc_result = grouped.describe() - idx = cats.codes.argsort() - ord_labels = np.asarray(cats).take(idx) - ord_data = data.take(idx) +def test_observed(observed): + # multiple groupers, don't re-expand the output space + # of the grouper + # gh-14942 (implement) + # gh-10132 (back-compat) + # gh-8138 (back-compat) + # gh-8869 - exp_cats = Categorical(ord_labels, ordered=True, - categories=['foo', 'bar', 'baz', 'qux']) - expected = ord_data.groupby(exp_cats, sort=False).describe() - assert_frame_equal(desc_result, expected) + cat1 = Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + df['C'] = ['foo', 'bar'] * 2 - # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), - levels, ordered=True) - exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(0)), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(1)), exp) + # multiple groupers with a non-cat + gb = df.groupby(['A', 'B', 'C'], observed=observed) + exp_index = pd.MultiIndex.from_arrays( + [cat1, cat2, ['foo', 'bar'] * 2], + 
names=['A', 'B', 'C']) + expected = DataFrame({'values': Series( + [1, 2, 3, 4], index=exp_index)}).sort_index() + result = gb.sum() + if not observed: + expected = cartesian_product_for_groupers( + expected, + [cat1, cat2, ['foo', 'bar']], + list('ABC')) + + tm.assert_frame_equal(result, expected) + + gb = df.groupby(['A', 'B'], observed=observed) + exp_index = pd.MultiIndex.from_arrays( + [cat1, cat2], + names=['A', 'B']) + expected = DataFrame({'values': [1, 2, 3, 4]}, + index=exp_index) + result = gb.sum() + if not observed: + expected = cartesian_product_for_groupers( + expected, + [cat1, cat2], + list('AB')) + + tm.assert_frame_equal(result, expected) + + # https://github.com/pandas-dev/pandas/issues/8138 + d = {'cat': + pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], + ordered=True), + 'ints': [1, 1, 2, 2], + 'val': [10, 20, 30, 40]} + df = pd.DataFrame(d) + + # Grouping on a single column + groups_single_key = df.groupby("cat", observed=observed) + result = groups_single_key.mean() + + exp_index = pd.CategoricalIndex(list('ab'), name="cat", + categories=list('abc'), + ordered=True) + expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]}, + index=exp_index) + if not observed: + index = pd.CategoricalIndex(list('abc'), name="cat", + categories=list('abc'), + ordered=True) + expected = expected.reindex(index) + + tm.assert_frame_equal(result, expected) + + # Grouping on two columns + groups_double_key = df.groupby(["cat", "ints"], observed=observed) + result = groups_double_key.agg('mean') + expected = DataFrame( + {"val": [10, 30, 20, 40], + "cat": pd.Categorical(['a', 'a', 'b', 'b'], + categories=['a', 'b', 'c'], + ordered=True), + "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"]) + if not observed: + expected = cartesian_product_for_groupers( + expected, + [df.cat.values, [1, 2]], + ['cat', 'ints']) + + tm.assert_frame_equal(result, expected) + + # GH 10132 + for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: + c, i = key + result = 
groups_double_key.get_group(key) + expected = df[(df.cat == c) & (df.ints == i)] + assert_frame_equal(result, expected) + + # gh-8869 + # with as_index + d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70], + 'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']} + df = pd.DataFrame(d) + cat = pd.cut(df['foo'], np.linspace(0, 10, 3)) + df['range'] = cat + groups = df.groupby(['range', 'baz'], as_index=False, observed=observed) + result = groups.agg('mean') + + groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed) + expected = groups2.agg('mean').reset_index() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason="failing with observed") +def test_observed_failing(observed): + d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} + df = pd.DataFrame(d) + values = pd.cut(df['C1'], [1, 2, 3, 6]) + values.name = "cat" + groups_double_key = df.groupby([values, 'C2'], observed=observed) + + idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], + names=["cat", "C2"]) + expected = DataFrame({"C1": [3, 3, 4, 5], + "C3": [10, 100, 200, 34]}, index=idx) + if not observed: + expected = cartesian_product_for_groupers( + expected, + [values.values, [1, 2, 3, 4]], + ['cat', 'C2']) + + result = groups_double_key.agg('mean') + tm.assert_frame_equal(result, expected) + + +def test_observed_perf(): + # we create a cartesian product, so this is + # non-performant if we don't use observed values + # gh-14942 + df = DataFrame({ + 'cat': np.random.randint(0, 255, size=30000), + 'int_id': np.random.randint(0, 255, size=30000), + 'other_id': np.random.randint(0, 10000, size=30000), + 'foo': 0}) + df['cat'] = df.cat.astype(str).astype('category') + + grouped = df.groupby(['cat', 'int_id', 'other_id'], observed=True) + result = grouped.count() + assert result.index.levels[0].nunique() == df.cat.nunique() + assert result.index.levels[1].nunique() == df.int_id.nunique() + assert result.index.levels[2].nunique() == 
df.other_id.nunique() -def test_groupby_datetime_categorical(): +def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range('2014-01-01', periods=4) codes = np.random.randint(0, 4, size=100) @@ -268,9 +426,9 @@ def test_groupby_datetime_categorical(): cats = Categorical.from_codes(codes, levels, ordered=True) data = DataFrame(np.random.randn(100, 4)) - result = data.groupby(cats).mean() + result = data.groupby(cats, observed=False).mean() - expected = data.groupby(np.asarray(cats)).mean() + expected = data.groupby(np.asarray(cats), observed=False).mean() expected = expected.reindex(levels) expected.index = CategoricalIndex(expected.index, categories=expected.index, @@ -278,13 +436,13 @@ def test_groupby_datetime_categorical(): assert_frame_equal(result, expected) - grouped = data.groupby(cats) + grouped = data.groupby(cats, observed=False) desc_result = grouped.describe() idx = cats.codes.argsort() ord_labels = cats.take_nd(idx) ord_data = data.take(idx) - expected = ord_data.groupby(ord_labels).describe() + expected = ord_data.groupby(ord_labels, observed=False).describe() assert_frame_equal(desc_result, expected) tm.assert_index_equal(desc_result.index, expected.index) tm.assert_index_equal( @@ -303,7 +461,7 @@ def test_groupby_datetime_categorical(): .get_level_values(1)), exp) -def test_groupby_categorical_index(): +def test_categorical_index(): s = np.random.RandomState(12345) levels = ['foo', 'bar', 'baz', 'qux'] @@ -315,23 +473,23 @@ def test_groupby_categorical_index(): df['cats'] = cats # with a cat index - result = df.set_index('cats').groupby(level=0).sum() - expected = df[list('abcd')].groupby(cats.codes).sum() + result = df.set_index('cats').groupby(level=0, observed=False).sum() + expected = df[list('abcd')].groupby(cats.codes, observed=False).sum() expected.index = CategoricalIndex( Categorical.from_codes( [0, 1, 2, 3], levels, ordered=True), name='cats') assert_frame_equal(result, expected) # with a cat column, should 
produce a cat index - result = df.groupby('cats').sum() - expected = df[list('abcd')].groupby(cats.codes).sum() + result = df.groupby('cats', observed=False).sum() + expected = df[list('abcd')].groupby(cats.codes, observed=False).sum() expected.index = CategoricalIndex( Categorical.from_codes( [0, 1, 2, 3], levels, ordered=True), name='cats') assert_frame_equal(result, expected) -def test_groupby_describe_categorical_columns(): +def test_describe_categorical_columns(): # GH 11558 cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], categories=['foo', 'bar', 'baz', 'qux'], @@ -343,14 +501,15 @@ def test_groupby_describe_categorical_columns(): tm.assert_categorical_equal(result.stack().columns.values, cats.values) -def test_groupby_unstack_categorical(): +def test_unstack_categorical(): # GH11558 (example is taken from the original issue) df = pd.DataFrame({'a': range(10), 'medium': ['A', 'B'] * 5, 'artist': list('XYXXY') * 2}) df['medium'] = df['medium'].astype('category') - gcat = df.groupby(['artist', 'medium'])['a'].count().unstack() + gcat = df.groupby( + ['artist', 'medium'], observed=False)['a'].count().unstack() result = gcat.describe() exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, @@ -363,7 +522,7 @@ def test_groupby_unstack_categorical(): tm.assert_series_equal(result, expected) -def test_groupby_bins_unequal_len(): +def test_bins_unequal_len(): # GH3011 series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) bins = pd.cut(series.dropna().values, 4) @@ -374,47 +533,45 @@ def f(): pytest.raises(ValueError, f) -def test_groupby_multi_categorical_as_index(): +def test_as_index(): # GH13204 df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), 'A': [10, 11, 11], 'B': [101, 102, 103]}) - result = df.groupby(['cat', 'A'], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) + result = 
df.groupby(['cat', 'A'], as_index=False, observed=True).sum() + expected = DataFrame( + {'cat': Categorical([1, 2], categories=df.cat.cat.categories), + 'A': [10, 11], + 'B': [101, 205]}, + columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) # function grouper f = lambda r: df.loc[r, 'A'] - result = df.groupby(['cat', f], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) + result = df.groupby(['cat', f], as_index=False, observed=True).sum() + expected = DataFrame( + {'cat': Categorical([1, 2], categories=df.cat.cat.categories), + 'A': [10, 22], + 'B': [101, 205]}, + columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) # another not in-axis grouper s = Series(['a', 'b', 'b'], name='cat2') - result = df.groupby(['cat', s], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) + result = df.groupby(['cat', s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) # GH18872: conflicting names in desired index - pytest.raises(ValueError, lambda: df.groupby(['cat', - s.rename('cat')]).sum()) + with pytest.raises(ValueError): + df.groupby(['cat', s.rename('cat')], observed=True).sum() # is original index dropped? 
- expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - group_columns = ['cat', 'A'] + expected = DataFrame( + {'cat': Categorical([1, 2], categories=df.cat.cat.categories), + 'A': [10, 11], + 'B': [101, 205]}, + columns=['cat', 'A', 'B']) for name in [None, 'X', 'B', 'cat']: df.index = Index(list("abc"), name=name) @@ -422,15 +579,17 @@ def test_groupby_multi_categorical_as_index(): if name in group_columns and name in df.index.names: with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.groupby(group_columns, as_index=False).sum() + result = df.groupby( + group_columns, as_index=False, observed=True).sum() else: - result = df.groupby(group_columns, as_index=False).sum() + result = df.groupby( + group_columns, as_index=False, observed=True).sum() - tm.assert_frame_equal(result, expected, check_index_type=True) + tm.assert_frame_equal(result, expected) -def test_groupby_preserve_categories(): +def test_preserve_categories(): # GH-13179 categories = list('abc') @@ -439,8 +598,10 @@ def test_groupby_preserve_categories(): categories=categories, ordered=True)}) index = pd.CategoricalIndex(categories, categories, ordered=True) - tm.assert_index_equal(df.groupby('A', sort=True).first().index, index) - tm.assert_index_equal(df.groupby('A', sort=False).first().index, index) + tm.assert_index_equal( + df.groupby('A', sort=True, observed=False).first().index, index) + tm.assert_index_equal( + df.groupby('A', sort=False, observed=False).first().index, index) # ordered=False df = DataFrame({'A': pd.Categorical(list('ba'), @@ -449,13 +610,15 @@ def test_groupby_preserve_categories(): sort_index = pd.CategoricalIndex(categories, categories, ordered=False) nosort_index = pd.CategoricalIndex(list('bac'), list('bac'), ordered=False) - tm.assert_index_equal(df.groupby('A', sort=True).first().index, - sort_index) - 
tm.assert_index_equal(df.groupby('A', sort=False).first().index, - nosort_index) + tm.assert_index_equal( + df.groupby('A', sort=True, observed=False).first().index, + sort_index) + tm.assert_index_equal( + df.groupby('A', sort=False, observed=False).first().index, + nosort_index) -def test_groupby_preserve_categorical_dtype(): +def test_preserve_categorical_dtype(): # GH13743, GH13854 df = DataFrame({'A': [1, 2, 1, 1, 2], 'B': [10, 16, 22, 28, 34], @@ -475,38 +638,22 @@ def test_groupby_preserve_categorical_dtype(): categories=list("bac"), ordered=True)}) for col in ['C1', 'C2']: - result1 = df.groupby(by=col, as_index=False).mean() - result2 = df.groupby(by=col, as_index=True).mean().reset_index() - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - - # multiple grouper - exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], - 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, - np.nan], - 'C1': Categorical(list("bacbac"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("bacbac"), - categories=list("bac"), - ordered=True)}) - for cols in [['A', 'C1'], ['A', 'C2']]: - result1 = df.groupby(by=cols, as_index=False).mean() - result2 = df.groupby(by=cols, as_index=True).mean().reset_index() + result1 = df.groupby(by=col, as_index=False, observed=False).mean() + result2 = df.groupby( + by=col, as_index=True, observed=False).mean().reset_index() expected = exp_full.reindex(columns=result1.columns) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) -def test_groupby_categorical_no_compress(): +def test_categorical_no_compress(): data = Series(np.random.randn(9)) codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) - result = data.groupby(cats).mean() - exp = data.groupby(codes).mean() + result = data.groupby(cats, observed=False).mean() + exp = data.groupby(codes, observed=False).mean() 
exp.index = CategoricalIndex(exp.index, categories=cats.categories, ordered=cats.ordered) @@ -515,8 +662,8 @@ def test_groupby_categorical_no_compress(): codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) - result = data.groupby(cats).mean() - exp = data.groupby(codes).mean().reindex(cats.categories) + result = data.groupby(cats, observed=False).mean() + exp = data.groupby(codes, observed=False).mean().reindex(cats.categories) exp.index = CategoricalIndex(exp.index, categories=cats.categories, ordered=cats.ordered) assert_series_equal(result, exp) @@ -525,13 +672,34 @@ def test_groupby_categorical_no_compress(): categories=["a", "b", "c", "d"], ordered=True) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - result = data.groupby("b").mean() + result = data.groupby("b", observed=False).mean() result = result["a"].values exp = np.array([1, 2, 4, np.nan]) tm.assert_numpy_array_equal(result, exp) -def test_groupby_sort_categorical(): +def test_sort(): + + # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby + # This should result in a properly sorted Series so that the plot + # has a sorted x axis + # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') + + df = DataFrame({'value': np.random.randint(0, 10000, 100)}) + labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=['value'], ascending=True) + df['value_group'] = pd.cut(df.value, range(0, 10500, 500), + right=False, labels=cat_labels) + + res = df.groupby(['value_group'], observed=False)['value_group'].count() + exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] + exp.index = CategoricalIndex(exp.index, name=exp.index.name) + tm.assert_series_equal(res, exp) + + +def test_sort2(): # dataframe groupby sort was being ignored # GH 8868 df = DataFrame([['(7.5, 10]', 10, 10], ['(7.5, 
10]', 8, 20], @@ -547,9 +715,12 @@ def test_groupby_sort_categorical(): columns=['foo', 'bar'], index=index) col = 'range' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) + assert_frame_equal( + result_sort, df.groupby(col, sort=True, observed=False).first()) + # when categories is ordered, group is ordered by category's order - assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) + assert_frame_equal( + result_sort, df.groupby(col, sort=False, observed=False).first()) df['range'] = Categorical(df['range'], ordered=False) index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', @@ -567,11 +738,13 @@ def test_groupby_sort_categorical(): col = 'range' # this is an unordered categorical, but we allow this #### - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) + assert_frame_equal( + result_sort, df.groupby(col, sort=True, observed=False).first()) + assert_frame_equal( + result_nosort, df.groupby(col, sort=False, observed=False).first()) -def test_groupby_sort_categorical_datetimelike(): +def test_sort_datetimelike(): # GH10505 # use same data as test_groupby_sort_categorical, which category is @@ -600,9 +773,12 @@ def test_groupby_sort_categorical_datetimelike(): name='dt', ordered=True) col = 'dt' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) + assert_frame_equal( + result_sort, df.groupby(col, sort=True, observed=False).first()) + # when categories is ordered, group is ordered by category's order - assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) + assert_frame_equal( + result_sort, df.groupby(col, sort=False, observed=False).first()) # ordered = False df['dt'] = Categorical(df['dt'], ordered=False) @@ -620,65 +796,10 @@ def test_groupby_sort_categorical_datetimelike(): name='dt') col = 'dt' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - 
assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) - - -def test_groupby_categorical_two_columns(): - - # https://github.com/pandas-dev/pandas/issues/8138 - d = {'cat': - pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], - ordered=True), - 'ints': [1, 1, 2, 2], - 'val': [10, 20, 30, 40]} - test = pd.DataFrame(d) - - # Grouping on a single column - groups_single_key = test.groupby("cat") - res = groups_single_key.agg('mean') - - exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", - ordered=True) - exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, - index=exp_index) - tm.assert_frame_equal(res, exp) - - # Grouping on two columns - groups_double_key = test.groupby(["cat", "ints"]) - res = groups_double_key.agg('mean') - exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], - "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], - ordered=True), - "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" - ]) - tm.assert_frame_equal(res, exp) - - # GH 10132 - for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: - c, i = key - result = groups_double_key.get_group(key) - expected = test[(test.cat == c) & (test.ints == i)] - assert_frame_equal(result, expected) - - d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} - test = pd.DataFrame(d) - values = pd.cut(test['C1'], [1, 2, 3, 6]) - values.name = "cat" - groups_double_key = test.groupby([values, 'C2']) - - res = groups_double_key.agg('mean') - nan = np.nan - idx = MultiIndex.from_product( - [Categorical([Interval(1, 2), Interval(2, 3), - Interval(3, 6)], ordered=True), - [1, 2, 3, 4]], - names=["cat", "C2"]) - exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, - nan, nan, nan, nan, 4, 5], - "C3": [nan, nan, nan, nan, 10, 100, - nan, nan, nan, nan, 200, 34]}, index=idx) - tm.assert_frame_equal(res, exp) + assert_frame_equal( + result_sort, df.groupby(col, sort=True, observed=False).first()) + assert_frame_equal( + result_nosort, 
df.groupby(col, sort=False, observed=False).first()) def test_empty_sum(): @@ -689,22 +810,22 @@ def test_empty_sum(): expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') # 0 by default - result = df.groupby("A").B.sum() + result = df.groupby("A", observed=False).B.sum() expected = pd.Series([3, 1, 0], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=0 - result = df.groupby("A").B.sum(min_count=0) + result = df.groupby("A", observed=False).B.sum(min_count=0) expected = pd.Series([3, 1, 0], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=1 - result = df.groupby("A").B.sum(min_count=1) + result = df.groupby("A", observed=False).B.sum(min_count=1) expected = pd.Series([3, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count>1 - result = df.groupby("A").B.sum(min_count=2) + result = df.groupby("A", observed=False).B.sum(min_count=2) expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) @@ -718,16 +839,16 @@ def test_empty_prod(): expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') # 1 by default - result = df.groupby("A").B.prod() + result = df.groupby("A", observed=False).B.prod() expected = pd.Series([2, 1, 1], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=0 - result = df.groupby("A").B.prod(min_count=0) + result = df.groupby("A", observed=False).B.prod(min_count=0) expected = pd.Series([2, 1, 1], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=1 - result = df.groupby("A").B.prod(min_count=1) + result = df.groupby("A", observed=False).B.prod(min_count=1) expected = pd.Series([2, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 1004b40bfb4c1..9f3faa80d3750 100644 --- a/pandas/tests/reshape/test_pivot.py +++ 
b/pandas/tests/reshape/test_pivot.py @@ -93,23 +93,23 @@ def test_pivot_table_dropna(self): def test_pivot_table_categorical(self): - raw_cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - raw_cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) - df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) + cat1 = Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) result = pd.pivot_table(df, values='values', index=['A', 'B']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True)], + exp_index = pd.MultiIndex.from_arrays( + [cat1, cat2], names=['A', 'B']) expected = DataFrame( - {'values': [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan]}, + {'values': [1, 2, 3, 4]}, index=exp_index) tm.assert_frame_equal(result, expected) - def test_pivot_table_dropna_categoricals(self): + @pytest.mark.parametrize('dropna', [True, False]) + def test_pivot_table_dropna_categoricals(self, dropna): # GH 15193 categories = ['a', 'b', 'c', 'd'] @@ -118,30 +118,23 @@ def test_pivot_table_dropna_categoricals(self): 'C': range(0, 9)}) df['A'] = df['A'].astype(CDT(categories, ordered=False)) - result_true = df.pivot_table(index='B', columns='A', values='C', - dropna=True) + result = df.pivot_table(index='B', columns='A', values='C', + dropna=dropna) expected_columns = Series(['a', 'b', 'c'], name='A') expected_columns = expected_columns.astype( CDT(categories, ordered=False)) expected_index = Series([1, 2, 3], name='B') - expected_true = DataFrame([[0.0, 3.0, 6.0], - [1.0, 4.0, 7.0], - [2.0, 5.0, 8.0]], - index=expected_index, - columns=expected_columns,) - tm.assert_frame_equal(expected_true, result_true) - - result_false = df.pivot_table(index='B', 
columns='A', values='C', - dropna=False) - expected_columns = ( - Series(['a', 'b', 'c', 'd'], name='A').astype('category') - ) - expected_false = DataFrame([[0.0, 3.0, 6.0, np.NaN], - [1.0, 4.0, 7.0, np.NaN], - [2.0, 5.0, 8.0, np.NaN]], - index=expected_index, - columns=expected_columns,) - tm.assert_frame_equal(expected_false, result_false) + expected = DataFrame([[0, 3, 6], + [1, 4, 7], + [2, 5, 8]], + index=expected_index, + columns=expected_columns,) + if not dropna: + # add back the non observed to compare + expected = expected.reindex( + columns=Categorical(categories)).astype('float') + + tm.assert_frame_equal(result, expected) def test_pass_array(self): result = self.data.pivot_table( @@ -1132,14 +1125,11 @@ def test_categorical_pivot_index_ordering(self): columns='Year', aggfunc='sum') expected_columns = pd.Int64Index([2013, 2014], name='Year') - expected_index = pd.CategoricalIndex(months, + expected_index = pd.CategoricalIndex(['January'], categories=months, ordered=False, name='Month') - expected_data = np.empty((12, 2)) - expected_data.fill(np.nan) - expected_data[0, :] = [320., 120.] 
- expected = pd.DataFrame(expected_data, + expected = pd.DataFrame([[320, 120]], index=expected_index, columns=expected_columns) tm.assert_frame_equal(result, expected) From 144a63d7daa741d35af1deefd562cfd4c051d169 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 22 Apr 2018 11:17:01 -0400 Subject: [PATCH 2/6] review comments --- doc/source/groupby.rst | 74 +++++++++++++------ doc/source/whatsnew/v0.23.0.txt | 33 ++++++++- pandas/conftest.py | 9 +++ pandas/core/arrays/categorical.py | 22 +++++- pandas/core/groupby/groupby.py | 27 +++---- pandas/core/indexes/category.py | 4 +- pandas/core/reshape/pivot.py | 23 +++--- pandas/tests/frame/test_sorting.py | 2 +- pandas/tests/groupby/aggregate/test_cython.py | 23 ++++-- pandas/tests/groupby/aggregate/test_other.py | 6 +- pandas/tests/groupby/test_categorical.py | 10 +-- pandas/tests/groupby/test_function.py | 6 +- pandas/tests/groupby/test_grouping.py | 25 +++++-- pandas/tests/reshape/test_pivot.py | 22 ++++-- 14 files changed, 197 insertions(+), 89 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 407fad39ba232..efd7adf90c494 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -91,10 +91,10 @@ The mapping can be specified many different ways: - A Python function, to be called on each of the axis labels. - A list or NumPy array of the same length as the selected axis. - A dict or ``Series``, providing a ``label -> group name`` mapping. - - For ``DataFrame`` objects, a string indicating a column to be used to group. + - For ``DataFrame`` objects, a string indicating a column to be used to group. Of course ``df.groupby('A')`` is just syntactic sugar for ``df.groupby(df['A'])``, but it makes life simpler. - - For ``DataFrame`` objects, a string indicating an index level to be used to + - For ``DataFrame`` objects, a string indicating an index level to be used to group. - A list of any of the above things. 
@@ -120,7 +120,7 @@ consider the following ``DataFrame``: 'D' : np.random.randn(8)}) df -On a DataFrame, we obtain a GroupBy object by calling :meth:`~DataFrame.groupby`. +On a DataFrame, we obtain a GroupBy object by calling :meth:`~DataFrame.groupby`. We could naturally group by either the ``A`` or ``B`` columns, or both: .. ipython:: python @@ -360,8 +360,8 @@ Index level names may be specified as keys directly to ``groupby``. DataFrame column selection in GroupBy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once you have created the GroupBy object from a DataFrame, you might want to do -something different for each of the columns. Thus, using ``[]`` similar to +Once you have created the GroupBy object from a DataFrame, you might want to do +something different for each of the columns. Thus, using ``[]`` similar to getting a column from a DataFrame, you can do: .. ipython:: python @@ -421,7 +421,7 @@ statement if you wish: ``for (k1, k2), group in grouped:``. Selecting a group ----------------- -A single group can be selected using +A single group can be selected using :meth:`~pandas.core.groupby.DataFrameGroupBy.get_group`: .. ipython:: python @@ -444,8 +444,8 @@ perform a computation on the grouped data. These operations are similar to the :ref:`aggregating API `, :ref:`window functions API `, and :ref:`resample API `. -An obvious one is aggregation via the -:meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` or equivalently +An obvious one is aggregation via the +:meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` or equivalently :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` method: .. 
ipython:: python @@ -517,12 +517,12 @@ Some common aggregating functions are tabulated below: :meth:`~pd.core.groupby.DataFrameGroupBy.nth`;Take nth value, or a subset if n is a list :meth:`~pd.core.groupby.DataFrameGroupBy.min`;Compute min of group values :meth:`~pd.core.groupby.DataFrameGroupBy.max`;Compute max of group values - -The aggregating functions above will exclude NA values. Any function which + +The aggregating functions above will exclude NA values. Any function which reduces a :class:`Series` to a scalar value is an aggregation function and will work, a trivial example is ``df.groupby('A').agg(lambda ser: 1)``. Note that -:meth:`~pd.core.groupby.DataFrameGroupBy.nth` can act as a reducer *or* a +:meth:`~pd.core.groupby.DataFrameGroupBy.nth` can act as a reducer *or* a filter, see :ref:`here `. .. _groupby.aggregate.multifunc: @@ -732,7 +732,7 @@ and that the transformed data contains no NAs. .. note:: Some functions will automatically transform the input when applied to a - GroupBy object, but returning an object of the same shape as the original. + GroupBy object, but returning an object of the same shape as the original. Passing ``as_index=False`` will not affect these transformation methods. For example: ``fillna, ffill, bfill, shift.``. @@ -926,7 +926,7 @@ The dimension of the returned result can also change: In [11]: grouped.apply(f) -``apply`` on a Series can operate on a returned value from the applied function, +``apply`` on a Series can operate on a returned value from the applied function, that is itself a series, and possibly upcast the result to a DataFrame: .. ipython:: python @@ -984,20 +984,48 @@ will be (silently) dropped. 
Thus, this does not pose any problems: df.groupby('A').std() -Note that ``df.groupby('A').colname.std().`` is more efficient than +Note that ``df.groupby('A').colname.std().`` is more efficient than ``df.groupby('A').std().colname``, so if the result of an aggregation function -is only interesting over one column (here ``colname``), it may be filtered +is only interesting over one column (here ``colname``), it may be filtered *before* applying the aggregation function. +.. _groupby.observed: + +observed handling +~~~~~~~~~~~~~~~~~ + +When using a ``Categorical`` grouper (as a single or as part of multiple groupers), the ``observed`` keyword +controls whether to return a cartesian product of all possible grouper values (``observed=False``) or only those +that are observed groupers (``observed=True``). The ``observed`` keyword will default to ``True`` in the future. + +Show only the observed values: + +.. ipython:: python + + pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=True).count() + +Show all values: + +.. ipython:: python + + pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=False).count() + +The returned dtype of the grouped will *always* include *all* of the categories that were grouped. + +.. ipython:: python + + s = pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=False).count() + s.index.dtype + .. _groupby.missing: NA and NaT group handling ~~~~~~~~~~~~~~~~~~~~~~~~~ -If there are any NaN or NaT values in the grouping key, these will be -automatically excluded. In other words, there will never be an "NA group" or -"NaT group". This was not the case in older versions of pandas, but users were -generally discarding the NA group anyway (and supporting it was an +If there are any NaN or NaT values in the grouping key, these will be +automatically excluded. In other words, there will never be an "NA group" or +"NaT group".
This was not the case in older versions of pandas, but users were +generally discarding the NA group anyway (and supporting it was an implementation headache). Grouping with ordered factors @@ -1084,8 +1112,8 @@ This shows the first or last n rows from each group. Taking the nth row of each group ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To select from a DataFrame or Series the nth item, use -:meth:`~pd.core.groupby.DataFrameGroupBy.nth`. This is a reduction method, and +To select from a DataFrame or Series the nth item, use +:meth:`~pd.core.groupby.DataFrameGroupBy.nth`. This is a reduction method, and will return a single row (or no row) per group if you pass an int for n: .. ipython:: python @@ -1153,7 +1181,7 @@ Enumerate groups .. versionadded:: 0.20.2 To see the ordering of the groups (as opposed to the order of rows -within a group given by ``cumcount``) you can use +within a group given by ``cumcount``) you can use :meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`. @@ -1273,7 +1301,7 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on Multi-column factorization ~~~~~~~~~~~~~~~~~~~~~~~~~~ -By using :meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`, we can extract +By using :meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`, we can extract information about the groups in a way similar to :func:`factorize` (as described further in the :ref:`reshaping API `) but which applies naturally to multiple columns of mixed type and different diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 8e77a7b18cdb0..1fca187d3fbbf 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -548,20 +548,47 @@ change to ``observed=True`` in the future. (:issue:`14942`, :issue:`8138`, :issu df['C'] = ['foo', 'bar'] * 2 df -Previous Behavior (show all values): +``observed`` must now be passed when grouping by categoricals, or a +``FutureWarning`` will show: + +.. 
ipython:: python + :okwarning: + + df.groupby(['A', 'B', 'C']).count() + + +To suppress the warning, with previous Behavior (show all values): .. ipython:: python -.. code-block:: python df.groupby(['A', 'B', 'C'], observed=False).count() -New Behavior (show only observed values): +Future Behavior (show only observed values): .. ipython:: python df.groupby(['A', 'B', 'C'], observed=True).count() +For pivoting operations, this behavior is *already* controlled by the ``dropna`` keyword: + +.. ipython:: python + + cat1 = pd.Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = pd.Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + df + +.. ipython:: python + + pd.pivot_table(df, values='values', index=['A', 'B'], + dropna=True) + pd.pivot_table(df, values='values', index=['A', 'B'], + dropna=False) + + .. _whatsnew_0230.api_breaking.deprecate_panel: Deprecate Panel diff --git a/pandas/conftest.py b/pandas/conftest.py index 559b5e44631b6..cf83904e4fa13 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -66,6 +66,15 @@ def ip(): return InteractiveShell() +@pytest.fixture(params=[True, False]) +def observed(request): + """ pass in the observed keyword to groupby for [True, False] + This indicates whether categoricals should return values for + values which are not in the grouper [False], or only values which + appear in the grouper [True] """ + return request.param + + @pytest.fixture(params=[None, 'gzip', 'bz2', 'zip', pytest.param('xz', marks=td.skip_if_no_lzma)]) def compression(request): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 517c21cc1bc3a..b7808aae16169 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -647,7 +647,7 @@ def _set_categories(self, categories, fastpath=False): self._dtype = new_dtype - def _codes_for_groupby(self, sort): + def
_codes_for_groupby(self, sort, observed): """ If sort=False, return a copy of self, coded with categories as returned by .unique(), followed by any categories not appearing in @@ -661,6 +661,8 @@ def _codes_for_groupby(self, sort): ---------- sort : boolean The value of the sort parameter groupby was called with. + observed : boolean + Account only for the observed values Returns ------- @@ -671,6 +673,22 @@ def _codes_for_groupby(self, sort): categories in the original order. """ + # we only care about observed values + if observed: + unique_codes = unique1d(self.codes) + cat = self.copy() + + take_codes = unique_codes[unique_codes != -1] + if self.ordered: + take_codes = np.sort(take_codes) + + # we recode according to the uniques + cat._categories = self.categories.take(take_codes) + cat._codes = _recode_for_categories(self.codes, + self.categories, + cat._categories) + return cat + # Already sorted according to self.categories; all is fine if sort: return self @@ -2161,7 +2179,7 @@ def unique(self): # exclude nan from indexer for categories take_codes = unique_codes[unique_codes != -1] if self.ordered: - take_codes = sorted(take_codes) + take_codes = np.sort(take_codes) return cat.set_categories(cat.categories.take(take_codes)) def _values_for_factorize(self): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b6a759efb6cf3..17d98f888e8ea 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1664,10 +1664,11 @@ def nth(self, n, dropna=None): if dropna not in ['any', 'all']: if isinstance(self._selected_obj, Series) and dropna is True: - warnings.warn("the dropna='%s' keyword is deprecated," + warnings.warn("the dropna={dropna} keyword is deprecated," "use dropna='all' instead. " "For a Series groupby, dropna must be " - "either None, 'any' or 'all'." 
% (dropna), + "either None, 'any' or 'all'.".format( + dropna=dropna), FutureWarning, stacklevel=2) dropna = 'all' @@ -2961,27 +2962,27 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Categorical elif is_categorical_dtype(self.grouper): - self.grouper = self.grouper._codes_for_groupby(self.sort) - codes = self.grouper.codes - categories = self.grouper.categories - - # we make a CategoricalIndex out of the cat grouper - # preserving the categories / ordered attributes - self._labels = codes - # Use the observed values of the grouper if inidcated observed = self.observed if observed is None: msg = ("pass observed=True to ensure that a " "categorical grouper only returns the " "observed groupers, or\n" - "observed=False to return NA for non-observed" - "values\n") + "observed=False to include" + "unobserved categories.\n") warnings.warn(msg, FutureWarning, stacklevel=5) observed = False + grouper = self.grouper + self.grouper = self.grouper._codes_for_groupby( + self.sort, observed) + categories = self.grouper.categories + + # we make a CategoricalIndex out of the cat grouper + # preserving the categories / ordered attributes + self._labels = self.grouper.codes if observed: - codes = algorithms.unique1d(codes) + codes = algorithms.unique1d(grouper.codes) else: codes = np.arange(len(categories)) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 71caa098c7a28..3ffef5804acf7 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -782,9 +782,9 @@ def _concat_same_dtype(self, to_concat, name): result.name = name return result - def _codes_for_groupby(self, sort): + def _codes_for_groupby(self, sort, observed): """ Return a Categorical adjusted for groupby """ - return self.values._codes_for_groupby(sort) + return self.values._codes_for_groupby(sort, observed) @classmethod def _add_comparison_methods(cls): diff --git a/pandas/core/reshape/pivot.py 
b/pandas/core/reshape/pivot.py index b9071d97f78a8..39fb57e68c9c0 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -120,6 +120,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', data = data[data.notna().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, + observed=dropna, margins_name=margins_name, fill_value=fill_value) # discard the top level @@ -138,7 +139,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', def _add_margins(table, data, values, rows, cols, aggfunc, - margins_name='All', fill_value=None): + observed=None, margins_name='All', fill_value=None): if not isinstance(margins_name, compat.string_types): raise ValueError('margins_name argument must be a string') @@ -168,6 +169,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc, if values: marginal_result_set = _generate_marginal_results(table, data, values, rows, cols, aggfunc, + observed, grand_margin, margins_name) if not isinstance(marginal_result_set, tuple): @@ -175,7 +177,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc, result, margin_keys, row_margin = marginal_result_set else: marginal_result_set = _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, margins_name) + table, data, rows, cols, aggfunc, observed, margins_name) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set @@ -230,6 +232,7 @@ def _compute_grand_margin(data, values, aggfunc, def _generate_marginal_results(table, data, values, rows, cols, aggfunc, + observed, grand_margin, margins_name='All'): if len(cols) > 0: @@ -242,12 +245,12 @@ def _all_key(key): if len(rows) > 0: margin = data[rows + values].groupby( - rows, observed=True).agg(aggfunc) + rows, observed=observed).agg(aggfunc) cat_axis = 1 for key, piece in table.groupby(level=0, axis=cat_axis, - observed=True): + 
observed=observed): all_key = _all_key(key) # we are going to mutate this, so need to copy! @@ -269,7 +272,7 @@ def _all_key(key): cat_axis = 0 for key, piece in table.groupby(level=0, axis=cat_axis, - observed=True): + observed=observed): all_key = _all_key(key) table_pieces.append(piece) table_pieces.append(Series(margin[key], index=[all_key])) @@ -285,7 +288,7 @@ def _all_key(key): if len(cols) > 0: row_margin = data[cols + values].groupby( - cols, observed=True).agg(aggfunc) + cols, observed=observed).agg(aggfunc) row_margin = row_margin.stack() # slight hack @@ -299,7 +302,7 @@ def _all_key(key): def _generate_marginal_results_without_values( table, data, rows, cols, aggfunc, - margins_name='All'): + observed, margins_name='All'): if len(cols) > 0: # need to "interleave" the margins margin_keys = [] @@ -311,7 +314,7 @@ def _all_key(): if len(rows) > 0: margin = data[rows].groupby(rows, - observed=True).apply(aggfunc) + observed=observed).apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table @@ -320,7 +323,7 @@ def _all_key(): else: margin = data.groupby(level=0, axis=0, - observed=True).apply(aggfunc) + observed=observed).apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table @@ -331,7 +334,7 @@ def _all_key(): margin_keys = table.columns if len(cols): - row_margin = data[cols].groupby(cols, observed=True).apply(aggfunc) + row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc) else: row_margin = Series(np.nan, index=result.columns) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 5bd239f8a3034..b60eb89e87da5 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -573,7 +573,7 @@ def test_sort_index_intervalindex(self): bins=[-3, -0.5, 0, 0.5, 3]) model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2']) - result = model.groupby(['X1', 'X2']).mean().unstack() + result = model.groupby(['X1', 'X2'], 
observed=True).mean().unstack() expected = IntervalIndex.from_tuples( [(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 80383c895a5e5..48a45e93e1e8e 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -158,35 +158,46 @@ def test__cython_agg_general(op, targop): ('min', np.min), ('max', np.max), ] ) -def test_cython_agg_empty_buckets(op, targop): +def test_cython_agg_empty_buckets(op, targop, observed): df = pd.DataFrame([11, 12, 13]) grps = range(0, 55, 5) # calling _cython_agg_general directly, instead of via the user API # which sets different values for min_count, so do that here. - result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) - expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) + g = df.groupby(pd.cut(df[0], grps), observed=observed) + result = g._cython_agg_general(op) + + g = df.groupby(pd.cut(df[0], grps), observed=observed) + expected = g.agg(lambda x: targop(x)) tm.assert_frame_equal(result, expected) -def test_cython_agg_empty_buckets_nanops(): +def test_cython_agg_empty_buckets_nanops(observed): # GH-18869 can't call nanops on empty groups, so hardcode expected # for these df = pd.DataFrame([11, 12, 13], columns=['a']) grps = range(0, 25, 5) # add / sum - result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add') + result = df.groupby(pd.cut(df['a'], grps), + observed=observed)._cython_agg_general('add') intervals = pd.interval_range(0, 20, freq=5) expected = pd.DataFrame( {"a": [0, 0, 36, 0]}, index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + if observed: + expected = expected[expected.a != 0] + tm.assert_frame_equal(result, expected) # prod - result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod') + result = df.groupby(pd.cut(df['a'], grps), + observed=observed)._cython_agg_general('prod') expected = 
pd.DataFrame( {"a": [1, 1, 1716, 1]}, index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + if observed: + expected = expected[expected.a != 1] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index a10f7f6e46210..34489051efc18 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -488,15 +488,17 @@ def test_agg_structs_series(structure, expected): @pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.") -def test_agg_category_nansum(): +def test_agg_category_nansum(observed): categories = ['a', 'b', 'c'] df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], categories=categories), 'B': [1, 2, 3]}) - result = df.groupby("A").B.agg(np.nansum) + result = df.groupby("A", observed=observed).B.agg(np.nansum) expected = pd.Series([3, 3, 0], index=pd.CategoricalIndex(['a', 'b', 'c'], categories=categories, name='A'), name='B') + if observed: + expected = expected[expected != 0] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 86c4f9c32c5f9..b1b65456bc03f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -12,11 +12,6 @@ import pandas.util.testing as tm -@pytest.fixture(params=[True, False]) -def observed(request): - return request.param - - def cartesian_product_for_groupers(result, args, names): """ Reindex to a cartesian production for the groupers, preserving the nature (Categorical) of each grouper """ @@ -378,8 +373,7 @@ def test_observed(observed): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="failing with observed") -def test_observed_failing(observed): +def test_observed_codes_remap(observed): d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} df = pd.DataFrame(d) values = pd.cut(df['C1'], [1, 
2, 3, 6]) @@ -680,7 +674,7 @@ def test_categorical_no_compress(): def test_sort(): - # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby + # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby # noqa: flake8 # This should result in a properly sorted Series so that the plot # has a sorted x axis # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index ba1371fe9f931..f1d678db4ff7f 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -313,14 +313,14 @@ def test_cython_median(): tm.assert_frame_equal(rs, xp) -def test_median_empty_bins(): +def test_median_empty_bins(observed): df = pd.DataFrame(np.random.randint(0, 44, 500)) grps = range(0, 55, 5) bins = pd.cut(df[0], grps) - result = df.groupby(bins).median() - expected = df.groupby(bins).agg(lambda x: x.median()) + result = df.groupby(bins, observed=observed).median() + expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 743237f5b386c..c0f5c43b2fd35 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -251,7 +251,7 @@ def test_groupby_levels_and_columns(self): by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64) tm.assert_frame_equal(by_levels, by_columns) - def test_groupby_categorical_index_and_columns(self): + def test_groupby_categorical_index_and_columns(self, observed): # GH18432 columns = ['A', 'B', 'A', 'B'] categories = ['B', 'A'] @@ -260,17 +260,26 @@ def test_groupby_categorical_index_and_columns(self): categories=categories, ordered=True) df = DataFrame(data=data, columns=cat_columns) - result = df.groupby(axis=1, level=0).sum() + result = 
df.groupby(axis=1, level=0, observed=observed).sum() expected_data = 2 * np.ones((5, 2), int) - expected_columns = CategoricalIndex(categories, - categories=categories, - ordered=True) + + if observed: + # if we are not-observed we undergo a reindex + # so need to adjust the output as our expected sets us up + # to be non-observed + expected_columns = CategoricalIndex(['A', 'B'], + categories=categories, + ordered=True) + else: + expected_columns = CategoricalIndex(categories, + categories=categories, + ordered=True) expected = DataFrame(data=expected_data, columns=expected_columns) assert_frame_equal(result, expected) # test transposed version df = DataFrame(data.T, index=cat_columns) - result = df.groupby(axis=0, level=0).sum() + result = df.groupby(axis=0, level=0, observed=observed).sum() expected = DataFrame(data=expected_data.T, index=expected_columns) assert_frame_equal(result, expected) @@ -572,11 +581,11 @@ def test_get_group(self): pytest.raises(ValueError, lambda: g.get_group(('foo', 'bar', 'baz'))) - def test_get_group_empty_bins(self): + def test_get_group_empty_bins(self, observed): d = pd.DataFrame([3, 1, 7, 6]) bins = [0, 5, 10, 15] - g = d.groupby(pd.cut(d[0], bins)) + g = d.groupby(pd.cut(d[0], bins), observed=observed) # TODO: should prob allow a str of Interval work as well # IOW '(0, 5]' diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 9f3faa80d3750..7a3387cf2318f 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -98,7 +98,8 @@ def test_pivot_table_categorical(self): cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - result = pd.pivot_table(df, values='values', index=['A', 'B']) + result = pd.pivot_table(df, values='values', index=['A', 'B'], + dropna=True) exp_index = pd.MultiIndex.from_arrays( [cat1, cat2], @@ -1061,7 +1062,7 @@ def 
test_pivot_table_margins_name_with_aggfunc_list(self): @pytest.mark.xfail(reason='GH 17035 (np.mean of ints is casted back to ' 'ints)') - def test_categorical_margins(self): + def test_categorical_margins(self, observed): # GH 10989 df = pd.DataFrame({'x': np.arange(8), 'y': np.arange(8) // 4, @@ -1071,12 +1072,12 @@ def test_categorical_margins(self): expected.index = Index([0, 1, 'All'], name='y') expected.columns = Index([0, 1, 'All'], name='z') - table = df.pivot_table('x', 'y', 'z', margins=True) + table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True) tm.assert_frame_equal(table, expected) @pytest.mark.xfail(reason='GH 17035 (np.mean of ints is casted back to ' 'ints)') - def test_categorical_margins_category(self): + def test_categorical_margins_category(self, observed): df = pd.DataFrame({'x': np.arange(8), 'y': np.arange(8) // 4, 'z': np.arange(8) % 2}) @@ -1087,16 +1088,17 @@ def test_categorical_margins_category(self): df.y = df.y.astype('category') df.z = df.z.astype('category') - table = df.pivot_table('x', 'y', 'z', margins=True) + table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True) tm.assert_frame_equal(table, expected) - def test_categorical_aggfunc(self): + def test_categorical_aggfunc(self, observed): # GH 9534 df = pd.DataFrame({"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]}) df["C1"] = df["C1"].astype("category") - result = df.pivot_table("V", index="C1", columns="C2", aggfunc="count") + result = df.pivot_table("V", index="C1", columns="C2", + dropna=observed, aggfunc="count") expected_index = pd.CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'], @@ -1111,7 +1113,7 @@ def test_categorical_aggfunc(self): columns=expected_columns) tm.assert_frame_equal(result, expected) - def test_categorical_pivot_index_ordering(self): + def test_categorical_pivot_index_ordering(self, observed): # GH 8731 df = pd.DataFrame({'Sales': [100, 120, 220], 'Month': ['January', 'January', 'January'], 
@@ -1123,6 +1125,7 @@ def test_categorical_pivot_index_ordering(self): result = df.pivot_table(values='Sales', index='Month', columns='Year', + dropna=observed, aggfunc='sum') expected_columns = pd.Int64Index([2013, 2014], name='Year') expected_index = pd.CategoricalIndex(['January'], @@ -1132,6 +1135,9 @@ def test_categorical_pivot_index_ordering(self): expected = pd.DataFrame([[320, 120]], index=expected_index, columns=expected_columns) + if not observed: + result = result.dropna().astype(int) + tm.assert_frame_equal(result, expected) def test_pivot_table_not_series(self): From 19c9cf7871847de8f0a8504e9f121ad1460512d0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 27 Apr 2018 06:25:52 -0400 Subject: [PATCH 3/6] use correct cast --- pandas/core/arrays/categorical.py | 14 +++++++---- pandas/core/groupby/groupby.py | 29 +++++++++++++++++----- pandas/tests/groupby/test_categorical.py | 31 +++++++++++++----------- pandas/tests/reshape/test_pivot.py | 2 +- 4 files changed, 50 insertions(+), 26 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b7808aae16169..c3be6e8dedfee 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -683,11 +683,15 @@ def _codes_for_groupby(self, sort, observed): take_codes = np.sort(take_codes) # we recode according to the uniques - cat._categories = self.categories.take(take_codes) - cat._codes = _recode_for_categories(self.codes, - self.categories, - cat._categories) - return cat + categories = self.categories.take(take_codes) + codes = _recode_for_categories(self.codes, + self.categories, + categories) + + # return a new categorical that maps our new codes + # and categories + dtype = CategoricalDtype(categories, ordered=self.ordered) + return type(self)(codes, dtype=dtype, fastpath=True) # Already sorted according to self.categories; all is fine if sort: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 
17d98f888e8ea..209fcfa26edcf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2341,10 +2341,10 @@ def recons_labels(self): @cache_readonly def result_index(self): if not self.compressed and len(self.groupings) == 1: - return self.groupings[0].group_index.rename(self.names[0]) + return self.groupings[0].result_index.rename(self.names[0]) labels = self.recons_labels - levels = [ping.group_index for ping in self.groupings] + levels = [ping.result_index for ping in self.groupings] result = MultiIndex(levels=levels, labels=labels, verify_integrity=False, @@ -2353,12 +2353,12 @@ def result_index(self): def get_group_levels(self): if not self.compressed and len(self.groupings) == 1: - return [self.groupings[0].group_index] + return [self.groupings[0].result_index] name_list = [] for ping, labels in zip(self.groupings, self.recons_labels): labels = _ensure_platform_int(labels) - levels = ping.group_index.take(labels) + levels = ping.result_index.take(labels) name_list.append(levels) @@ -2911,6 +2911,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) + self.all_grouper = None self.index = index self.sort = sort self.obj = obj @@ -2973,7 +2974,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, warnings.warn(msg, FutureWarning, stacklevel=5) observed = False - grouper = self.grouper + self.all_grouper = self.grouper self.grouper = self.grouper._codes_for_groupby( self.sort, observed) categories = self.grouper.categories @@ -2982,7 +2983,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # preserving the categories / ordered attributes self._labels = self.grouper.codes if observed: - codes = algorithms.unique1d(grouper.codes) + codes = algorithms.unique1d(self.grouper.codes) else: codes = np.arange(len(categories)) @@ -3049,6 +3050,22 @@ def labels(self): self._make_labels() 
return self._labels + @cache_readonly + def result_index(self): + if self.all_grouper is not None: + all_categories = self.all_grouper.categories + + # we re-order to the original category orderings + if self.sort: + return self.group_index.set_categories(all_categories) + + # we are not sorting, so add unobserved to the end + categories = self.group_index.categories + return self.group_index.add_categories( + all_categories[~all_categories.isin(categories)]) + + return self.group_index + @property def group_index(self): if self._group_index is None: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index b1b65456bc03f..ffd1d04e8b18f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -705,37 +705,40 @@ def test_sort2(): df['range'] = Categorical(df['range'], ordered=True) index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], name='range', ordered=True) - result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) + expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], + columns=['foo', 'bar'], index=index) col = 'range' - assert_frame_equal( - result_sort, df.groupby(col, sort=True, observed=False).first()) + result_sort = df.groupby(col, sort=True, observed=False).first() + assert_frame_equal(result_sort, expected_sort) # when categories is ordered, group is ordered by category's order - assert_frame_equal( - result_sort, df.groupby(col, sort=False, observed=False).first()) + expected_sort = result_sort + result_sort = df.groupby(col, sort=False, observed=False).first() + assert_frame_equal(result_sort, expected_sort) df['range'] = Categorical(df['range'], ordered=False) index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], name='range') - result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) + expected_sort = 
DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], + columns=['foo', 'bar'], index=index) index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', '(0, 2.5]'], categories=['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', '(0, 2.5]'], name='range') - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - index=index, columns=['foo', 'bar']) + expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], + index=index, columns=['foo', 'bar']) col = 'range' + # this is an unordered categorical, but we allow this #### - assert_frame_equal( - result_sort, df.groupby(col, sort=True, observed=False).first()) - assert_frame_equal( - result_nosort, df.groupby(col, sort=False, observed=False).first()) + result_sort = df.groupby(col, sort=True, observed=False).first() + assert_frame_equal(result_sort, expected_sort) + + result_nosort = df.groupby(col, sort=False, observed=False).first() + assert_frame_equal(result_nosort, expected_nosort) def test_sort_datetimelike(): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 7a3387cf2318f..76cdc1d2a195d 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1136,7 +1136,7 @@ def test_categorical_pivot_index_ordering(self, observed): index=expected_index, columns=expected_columns) if not observed: - result = result.dropna().astype(int) + result = result.dropna().astype(np.int64) tm.assert_frame_equal(result, expected) From 7ae10badd411f0628598a573ce58de0f87c85a66 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 30 Apr 2018 19:55:40 -0400 Subject: [PATCH 4/6] doc updates --- doc/source/groupby.rst | 4 ++-- pandas/core/arrays/categorical.py | 5 +++++ pandas/core/generic.py | 6 +++--- pandas/core/groupby/groupby.py | 4 ++-- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index efd7adf90c494..46a6ab5b3b140 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -991,8 
+991,8 @@ is only interesting over one column (here ``colname``), it may be filtered .. _groupby.observed: -observed hanlding -~~~~~~~~~~~~~~~~~ +Handling of (un)observed Categorical values +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ When using a ``Categorical`` grouper (as a single or as part of multipler groupers), the ``observed`` keyword controls whether to return a cartesian product of all possible groupers values (``observed=False``) or only those diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c3be6e8dedfee..f91782459df67 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -649,6 +649,11 @@ def _set_categories(self, categories, fastpath=False): def _codes_for_groupby(self, sort, observed): """ + Code the categories to ensure we can groupby for categoricals. + + If observed=True, we return a new Categorical with the observed + categories only. + If sort=False, return a copy of self, coded with categories as returned by .unique(), followed by any categories not appearing in the data. If sort=True, return self. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9f68d26208619..e68662037b43d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6633,10 +6633,10 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, reduce the dimensionality of the return type if possible, otherwise return a consistent type observed : boolean, default None - if True: only show observed values for categorical groupers - if False: show all values for categorical groupers + if True: only show observed values for categorical groupers. + if False: show all values for categorical groupers. if None: if any categorical groupers, show a FutureWarning, - default to False + default to False. .. 
versionadded:: 0.23.0 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 209fcfa26edcf..73dc933879942 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2968,8 +2968,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if observed is None: msg = ("pass observed=True to ensure that a " "categorical grouper only returns the " - "observed groupers, or\n" - "observed=False to include" + "observed categories, or\n" + "observed=False to also include" "unobserved categories.\n") warnings.warn(msg, FutureWarning, stacklevel=5) observed = False From bdb7ad3db6623c16301570667c7a11936caa290c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 30 Apr 2018 20:00:24 -0400 Subject: [PATCH 5/6] make observed=False the default, remove deprecation warning --- doc/source/groupby.rst | 10 +- doc/source/whatsnew/v0.23.0.txt | 114 +++++++++++------------ pandas/core/groupby/groupby.py | 20 +--- pandas/tests/groupby/test_categorical.py | 17 ---- 4 files changed, 62 insertions(+), 99 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 46a6ab5b3b140..3616a7e1b41d2 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -996,19 +996,19 @@ Handling of (un)observed Categorical values When using a ``Categorical`` grouper (as a single or as part of multipler groupers), the ``observed`` keyword controls whether to return a cartesian product of all possible groupers values (``observed=False``) or only those -that are observed groupers (``observed=True``). The ``observed`` keyword will default to ``True`` in the future. +that are observed groupers (``observed=True``). -Show only the observed values: +Show all values: .. 
ipython:: python - pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=True).count() + pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=False).count() -Show all values: +Show only the observed values: .. ipython:: python - pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=False).count() + pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=True).count() The returned dtype of the grouped will *always* include *all* of the catergories that were grouped. diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 1fca187d3fbbf..5af703822829b 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -396,6 +396,58 @@ documentation. If you build an extension array, publicize it on our .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest/ +.. _whatsnew_0230.enhancements.categorical_grouping: + +Categorical Groupers has gained an observed keyword +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions, grouping by 1 or more categorical columns would result in an index that was the cartesian product of all of the categories for +each grouper, not just the observed values. ``.groupby()`` has gained the ``observed`` keyword to toggle this behavior. The default remains backward +compatible (generate a cartesian product). (:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`) + + +.. ipython:: python + + cat1 = pd.Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = pd.Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + df['C'] = ['foo', 'bar'] * 2 + df + +To show all values, the previous behavior: + +.. 
ipython:: python + + df.groupby(['A', 'B', 'C'], observed=False).count() + + +To show only observed values: + +.. ipython:: python + + df.groupby(['A', 'B', 'C'], observed=True).count() + +For pivotting operations, this behavior is *already* controlled by the ``dropna`` keyword: + +.. ipython:: python + + cat1 = pd.Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = pd.Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + df + +.. ipython:: python + + pd.pivot_table(df, values='values', index=['A', 'B'], + dropna=True) + pd.pivot_table(df, values='values', index=['A', 'B'], + dropna=False) + + .. _whatsnew_0230.enhancements.other: Other Enhancements @@ -527,68 +579,6 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use 'Taxes': -200, 'Net result': 300}).sort_index() -.. _whatsnew_0230.api_breaking.categorical_grouping: - -Categorical Groupers will now require passing the observed keyword -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -In previous versions, grouping by 1 or more categorical columns would result in an index that was the cartesian product of all of the categories for -each grouper, not just the observed values.``.groupby()`` has gained the ``observed`` keyword to toggle this behavior. The default remains backward -compatible (generate a cartesian product). Pandas will show a ``FutureWarning`` if the ``observed`` keyword is not passed; the default will -change to ``observed=True`` in the future. (:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`) - - -.. 
ipython:: python - - cat1 = pd.Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - cat2 = pd.Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) - df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - df['C'] = ['foo', 'bar'] * 2 - df - -``observed`` must now be passed when grouping by categoricals, or a -``FutureWarning`` will show: - -.. ipython:: python - :okwarning: - - df.groupby(['A', 'B', 'C']).count() - - -To suppress the warning, with previous Behavior (show all values): - -.. ipython:: python - - df.groupby(['A', 'B', 'C'], observed=False).count() - - -Future Behavior (show only observed values): - -.. ipython:: python - - df.groupby(['A', 'B', 'C'], observed=True).count() - -For pivotting operations, this behavior is *already* controlled by the ``dropna`` keyword: - -.. ipython:: python - - cat1 = pd.Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - cat2 = pd.Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) - df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - df - -.. ipython:: python - - pd.pivot_table(df, values='values', index=['A', 'B'], - dropna=True) - pd.pivot_table(df, values='values', index=['A', 'B'], - dropna=False) - - .. 
_whatsnew_0230.api_breaking.deprecate_panel: Deprecate Panel diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 73dc933879942..63f0b742eb8b3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -557,7 +557,7 @@ class _GroupBy(PandasObject, SelectionMixin): def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, sort=True, group_keys=True, squeeze=False, - observed=None, **kwargs): + observed=False, **kwargs): self._selection = selection @@ -2890,7 +2890,8 @@ class Grouping(object): obj : name : level : - observed : If we are a Categorical, use the observed values + observed : boolean, default False + If we are a Categorical, use the observed values in_axis : if the Grouping is a column in self.obj and hence among Groupby.exclusions list @@ -2906,7 +2907,7 @@ class Grouping(object): """ def __init__(self, index, grouper=None, obj=None, name=None, level=None, - sort=True, observed=None, in_axis=False): + sort=True, observed=False, in_axis=False): self.name = name self.level = level @@ -2963,17 +2964,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Categorical elif is_categorical_dtype(self.grouper): - # Use the observed values of the grouper if inidcated - observed = self.observed - if observed is None: - msg = ("pass observed=True to ensure that a " - "categorical grouper only returns the " - "observed categories, or\n" - "observed=False to also include" - "unobserved categories.\n") - warnings.warn(msg, FutureWarning, stacklevel=5) - observed = False - self.all_grouper = self.grouper self.grouper = self.grouper._codes_for_groupby( self.sort, observed) @@ -3092,7 +3082,7 @@ def groups(self): def _get_grouper(obj, key=None, axis=0, level=None, sort=True, - observed=None, mutated=False, validate=True): + observed=False, mutated=False, validate=True): """ create and return a BaseGrouper, which is an internal 
mapping of how to create the grouper indexers. diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index ffd1d04e8b18f..923a79b6a7720 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -246,23 +246,6 @@ def test_apply(ordered): assert_series_equal(result, expected) -def test_observed_warning(): - # 20583 - future warning on observe - - cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) - df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - df['C'] = ['foo', 'bar'] * 2 - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df.groupby(['A', 'B', 'C']) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df.groupby('A') - - def test_observed(observed): # multiple groupers, don't re-expand the output space # of the grouper From bdf7525812ca670f9406ab8df333030d36d30947 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 1 May 2018 06:06:50 -0400 Subject: [PATCH 6/6] more tests & change observed=None --- pandas/conftest.py | 8 +++++--- pandas/core/groupby/groupby.py | 12 +++++++++--- pandas/tests/groupby/test_categorical.py | 20 ++++++++++++++++++++ 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index cf83904e4fa13..c4aab1b632b00 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -66,12 +66,14 @@ def ip(): return InteractiveShell() -@pytest.fixture(params=[True, False]) +@pytest.fixture(params=[True, False, None]) def observed(request): """ pass in the observed keyword to groupby for [True, False] This indicates whether categoricals should return values for - values which are not in the grouper [False], or only values which - appear in the grouper [True] """ + values which are not in the grouper [False / None], or only values which 
+ appear in the grouper [True]. [None] is supported for future compatibility + if we decide to change the default (and would need to warn if this + parameter is not passed)""" return request.param diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 63f0b742eb8b3..8613ab4d8c59d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -557,7 +557,7 @@ class _GroupBy(PandasObject, SelectionMixin): def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, sort=True, group_keys=True, squeeze=False, - observed=False, **kwargs): + observed=None, **kwargs): self._selection = selection @@ -2907,7 +2907,7 @@ class Grouping(object): """ def __init__(self, index, grouper=None, obj=None, name=None, level=None, - sort=True, observed=False, in_axis=False): + sort=True, observed=None, in_axis=False): self.name = name self.level = level @@ -2964,6 +2964,12 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Categorical elif is_categorical_dtype(self.grouper): + # observed can be True/False/None + # we treat None as False. If in the future + # we need to warn if observed is not passed + # then we have this option + # gh-20583 + self.all_grouper = self.grouper self.grouper = self.grouper._codes_for_groupby( self.sort, observed) @@ -3082,7 +3088,7 @@ def groups(self): def _get_grouper(obj, key=None, axis=0, level=None, sort=True, - observed=False, mutated=False, validate=True): + observed=None, mutated=False, validate=True): """ create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers.
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 923a79b6a7720..e0793b8e1bd64 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -395,6 +395,26 @@ def test_observed_perf(): assert result.index.levels[2].nunique() == df.other_id.nunique() +def test_observed_groups(observed): + # gh-20583 + # test that we have the appropriate groups + + cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c']) + df = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]}) + g = df.groupby('cat', observed=observed) + + result = g.groups + if observed: + expected = {'a': Index([0, 2], dtype='int64'), + 'c': Index([1], dtype='int64')} + else: + expected = {'a': Index([0, 2], dtype='int64'), + 'b': Index([], dtype='int64'), + 'c': Index([1], dtype='int64')} + + tm.assert_dict_equal(result, expected) + + def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range('2014-01-01', periods=4)