From 03ee26b5d18b8b459e1f27f32d92026e3456eaac Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 15 May 2019 16:04:21 -0700 Subject: [PATCH 1/9] Added test coverage for observed=False with ops --- pandas/tests/groupby/conftest.py | 7 ++++++ pandas/tests/groupby/test_categorical.py | 29 ++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index cb4fe511651ee..3cb07b98897d5 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -76,3 +76,10 @@ def three_group(): 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11)}) + + +AGG_FUNCS = ['sum', 'prod', 'min', 'max', 'mean', 'median', 'var', 'first', + 'last', 'nth'] # TODO: ohlc? +@pytest.fixture(params=AGG_FUNCS) +def agg_func(request): + return request.param diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 112f7629d735a..fb7f79cd53973 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -433,6 +433,35 @@ def test_observed_groups_with_nan(observed): tm.assert_dict_equal(result, expected) +def test_observed_ops(agg_func): + cat = pd.Categorical(['a', np.nan, np.nan], categories=['a', 'b', 'c']) + ser = pd.Series([1., 2., 3.]) + df = pd.DataFrame({'cat': cat, 'ser': ser}) + + grp = df.groupby('cat', observed=False)['ser'] + func = getattr(grp, agg_func) + + if agg_func == 'nth': # Need an argument + result = func(0) + else: + result = func() + + if agg_func == 'sum': # TODO: maybe a bug? + expected_vals = [1., 0., 0.] + elif agg_func == 'prod': # TODO: Definitely seems like a bug + expected_vals = [1., 1., 1.] + elif agg_func == 'var': + expected_vals = [np.nan, np.nan, np.nan] + else: + expected_vals = [1., np.nan, np.nan] + + index = pd.Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']) + expected = pd.Series(expected_vals, index=index, name='ser') + expected.index.name = 'cat' + + tm.assert_series_equal(result, expected) + + def test_dataframe_categorical_with_nan(observed): # GH 21151 s1 = pd.Categorical([np.nan, 'a', np.nan, 'a'], From ee549ed668d485d342c5ed9589eaea8327bb7401 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 15 May 2019 16:11:55 -0700 Subject: [PATCH 2/9] Fixed issue with observed=False and nth --- pandas/core/groupby/groupby.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ae28c23b85a39..a17bd8b2ad2b5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -42,7 +42,7 @@ class providing the base-class of operations. from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base -from pandas.core.index import Index, MultiIndex +from pandas.core.index import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter @@ -839,6 +839,7 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): output = {} + for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: @@ -1707,7 +1708,12 @@ def nth(self, if not self.as_index: return out - out.index = self.grouper.result_index[ids[mask]] + result_index = self.grouper.result_index + out.index = result_index[ids[mask]] + + if not self.observed and isinstance( + result_index, CategoricalIndex): + out = out.reindex(result_index) return out.sort_index() if self.sort else out From f0a510de33c9e68a49b1ac2c03780e31df256521 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 15 May 2019 16:15:25 -0700 Subject: [PATCH 3/9] Stubbed whatsnew note --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 7924a029e72c3..191ac327b1a5c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -441,6 +441,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise error (:issue:`26208`) +- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) Reshaping From e59a991e395cd6c13832dcb3b23bb12e06f3e01a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 17 May 2019 08:04:15 -0400 Subject: [PATCH 4/9] lint fixup --- pandas/tests/groupby/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 3cb07b98897d5..559f6b35b9ab6 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -80,6 +80,8 @@ def three_group(): AGG_FUNCS = ['sum', 'prod', 'min', 'max', 'mean', 'median', 'var', 'first', 'last', 'nth'] # TODO: ohlc? + + @pytest.fixture(params=AGG_FUNCS) def agg_func(request): return request.param From 36774714cdd1106befbefa952335756bb7920434 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 18 May 2019 19:19:09 -0700 Subject: [PATCH 5/9] Simplified test --- pandas/core/groupby/groupby.py | 1 - pandas/tests/groupby/conftest.py | 9 --------- pandas/tests/groupby/test_categorical.py | 22 ++++------------------ 3 files changed, 4 insertions(+), 28 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a17bd8b2ad2b5..b1f5139369e37 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -839,7 +839,6 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): output = {} - for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 559f6b35b9ab6..cb4fe511651ee 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -76,12 +76,3 @@ def three_group(): 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11)}) - - -AGG_FUNCS = ['sum', 'prod', 'min', 'max', 'mean', 'median', 'var', 'first', - 'last', 'nth'] # TODO: ohlc? - - -@pytest.fixture(params=AGG_FUNCS) -def agg_func(request): - return request.param diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index fb7f79cd53973..19f4f01f71c27 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -433,29 +433,15 @@ def test_observed_groups_with_nan(observed): tm.assert_dict_equal(result, expected) -def test_observed_ops(agg_func): +def test_observed_nth(): + # GH 26385 cat = pd.Categorical(['a', np.nan, np.nan], categories=['a', 'b', 'c']) ser = pd.Series([1., 2., 3.]) df = pd.DataFrame({'cat': cat, 'ser': ser}) - grp = df.groupby('cat', observed=False)['ser'] - func = getattr(grp, agg_func) + result = df.groupby('cat', observed=False)['ser'].nth(0) - if agg_func == 'nth': # Need an argument - result = func(0) - else: - result = func() - - if agg_func == 'sum': # TODO: maybe a bug? - expected_vals = [1., 0., 0.] - elif agg_func == 'prod': # TODO: Definitely seems like a bug - expected_vals = [1., 1., 1.] - elif agg_func == 'var': - expected_vals = [np.nan, np.nan, np.nan] - else: - expected_vals = [1., np.nan, np.nan] - - index = pd.Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']) + index = pd.Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']) expected = pd.Series(expected_vals, index=index, name='ser') expected.index.name = 'cat' From 2ca34e3fc2e403db7b981c7c04455b472464c19b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 18 May 2019 19:22:14 -0700 Subject: [PATCH 6/9] whatsnew whitespace fix --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/tests/groupby/test_categorical.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 568b5ca63e351..20833ce6fcbbe 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -445,7 +445,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise error (:issue:`26208`) - Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. (:issue:`26310`) -- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) +- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) Reshaping diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 19f4f01f71c27..6fe43ba4e56d6 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -436,13 +436,13 @@ def test_observed_groups_with_nan(observed): def test_observed_nth(): # GH 26385 cat = pd.Categorical(['a', np.nan, np.nan], categories=['a', 'b', 'c']) - ser = pd.Series([1., 2., 3.]) + ser = pd.Series([1, 2, 3]) df = pd.DataFrame({'cat': cat, 'ser': ser}) result = df.groupby('cat', observed=False)['ser'].nth(0) - index = pd.Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']) - expected = pd.Series(expected_vals, index=index, name='ser') + index = pd.Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']) + expected = pd.Series([1, np.nan, np.nan], index=index, name='ser') expected.index.name = 'cat' tm.assert_series_equal(result, expected) From 47201fb6ccf6eac0f40c0f18efd6ee2876e7d27f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 15 Jul 2019 09:06:38 -0700 Subject: [PATCH 7/9] blackify --- pandas/core/groupby/groupby.py | 3 +-- pandas/tests/groupby/test_categorical.py | 12 ++++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6ad92b6139af8..b852513e454a2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1774,8 +1774,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra result_index = self.grouper.result_index out.index = result_index[ids[mask]] - if not self.observed and isinstance( - result_index, CategoricalIndex): + if not self.observed and isinstance(result_index, CategoricalIndex): out = out.reindex(result_index) return out.sort_index() if self.sort else out diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0ee81845fa34c..03c9e873dede8 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -433,15 +433,15 @@ def test_observed_groups_with_nan(observed): def test_observed_nth(): # GH 26385 - cat = pd.Categorical(['a', np.nan, np.nan], categories=['a', 'b', 'c']) + cat = pd.Categorical(["a", np.nan, np.nan], categories=["a", "b", "c"]) ser = pd.Series([1, 2, 3]) - df = pd.DataFrame({'cat': cat, 'ser': ser}) + df = pd.DataFrame({"cat": cat, "ser": ser}) - result = df.groupby('cat', observed=False)['ser'].nth(0) + result = df.groupby("cat", observed=False)["ser"].nth(0) - index = pd.Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']) - expected = pd.Series([1, np.nan, np.nan], index=index, name='ser') - expected.index.name = 'cat' + index = pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]) + expected = pd.Series([1, np.nan, np.nan], index=index, name="ser") + expected.index.name = "cat" tm.assert_series_equal(result, expected) From 1804e27a4f571ccbd66afe6f756efc9d9e7ad669 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 15 Jul 2019 10:54:10 -0700 Subject: [PATCH 8/9] Removed doc whitespace --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index c913bef5f3fbf..af205652d2e33 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1141,7 +1141,7 @@ Groupby/resample/rolling - Improved :class:`pandas.core.window.Rolling`, :class:`pandas.core.window.Window` and :class:`pandas.core.window.EWM` functions to exclude nuisance columns from results instead of raising errors and raise a ``DataError`` only if all columns are nuisance (:issue:`12537`) - Bug in :meth:`pandas.core.window.Rolling.max` and :meth:`pandas.core.window.Rolling.min` where incorrect results are returned with an empty variable window (:issue:`26005`) - Raise a helpful exception when an unsupported weighted window function is used as an argument of :meth:`pandas.core.window.Window.aggregate` (:issue:`26597`) -- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) +- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) Reshaping ^^^^^^^^^ From a8375642967719de0196370e071ba74975530d6f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 25 Jul 2019 15:41:00 -0700 Subject: [PATCH 9/9] moved whatsnew to 0.25.1 --- doc/source/whatsnew/v0.25.0.rst | 1 - doc/source/whatsnew/v0.25.1.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 70afce5e0e549..42e756635e739 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1207,7 +1207,6 @@ Groupby/resample/rolling - Improved :class:`pandas.core.window.Rolling`, :class:`pandas.core.window.Window` and :class:`pandas.core.window.EWM` functions to exclude nuisance columns from results instead of raising errors and raise a ``DataError`` only if all columns are nuisance (:issue:`12537`) - Bug in :meth:`pandas.core.window.Rolling.max` and :meth:`pandas.core.window.Rolling.min` where incorrect results are returned with an empty variable window (:issue:`26005`) - Raise a helpful exception when an unsupported weighted window function is used as an argument of :meth:`pandas.core.window.Window.aggregate` (:issue:`26597`) -- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) Reshaping ^^^^^^^^^ diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index eb60272246ebb..fb9e37c3a7e30 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -122,7 +122,7 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`) -- +- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) - Reshaping