From a6e34917f0efa97a12eef97e2bd80db2d344a942 Mon Sep 17 00:00:00 2001 From: LucasG0 Date: Sun, 20 Mar 2022 23:02:19 +0100 Subject: [PATCH 1/5] DataFrameGroupBy.value_counts includes non-observed categories of non-grouping columns --- doc/source/whatsnew/v1.5.0.rst | 6 +- pandas/core/groupby/generic.py | 18 ++- .../tests/groupby/test_frame_value_counts.py | 114 ++++++++++++++---- 3 files changed, 108 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b081f743f9b0b..2bd474271483d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -811,9 +811,9 @@ Bug fixes Categorical ^^^^^^^^^^^ -- Bug in :meth:`.Categorical.view` not accepting integer dtypes (:issue:`25464`) -- Bug in :meth:`.CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`) -- +- Bug in :meth:`Categorical.view` not accepting integer dtypes (:issue:`25464`) +- Bug in :meth:`CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`) +- Bug in :meth:`DataFrameGroupBy.value_counts` includes non-observed categories of non-grouping columns regardless of ``observed`` (:issue:`46357`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9e26598d85e74..9c2caff40b498 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -69,6 +69,7 @@ reconstruct_func, validate_func_kwargs, ) +from pandas.core.arrays.categorical import Categorical import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame @@ -87,6 +88,7 @@ MultiIndex, all_indexes_same, ) +from pandas.core.indexes.category import CategoricalIndex from 
pandas.core.series import Series from pandas.core.shared_docs import _shared_docs from pandas.core.util.numba_ import maybe_use_numba @@ -1819,6 +1821,7 @@ def value_counts( key=key, axis=self.axis, sort=self.sort, + observed=False, dropna=dropna, ) groupings += list(grouper.groupings) @@ -1832,6 +1835,19 @@ def value_counts( ) result_series = cast(Series, gb.size()) + # GH-46357 Include non-observed categories + # of non-grouping columns regardless of `observed` + if any( + isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) + and not grouping._observed + for grouping in groupings + ): + levels_list = [ping.group_index for ping in groupings] + multi_index, _ = MultiIndex.from_product( + levels_list, names=[ping.name for ping in groupings] + ).sortlevel() + result_series = result_series.reindex(multi_index, fill_value=0) + if normalize: # Normalize the results by dividing by the original group sizes. # We are guaranteed to have the first N levels be the @@ -1842,10 +1858,8 @@ def value_counts( indexed_group_size = result_series.groupby( result_series.index.droplevel(levels), sort=self.sort, - observed=self.observed, dropna=self.dropna, ).transform("sum") - result_series /= indexed_group_size if sort: diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index affef05ba4ed3..bd53aea80f54d 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -308,35 +308,35 @@ def test_data_frame_value_counts_dropna( tm.assert_series_equal(result_frame_groupby, expected) -@pytest.mark.parametrize("as_index", [False, True]) +@pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize( "observed, expected_index", [ ( False, [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), - ("US", "male", 
"low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), + ("FR", "high", "female"), + ("FR", "high", "male"), + ("FR", "low", "male"), + ("FR", "low", "female"), + ("FR", "medium", "male"), + ("FR", "medium", "female"), + ("US", "high", "female"), + ("US", "high", "male"), + ("US", "low", "male"), + ("US", "low", "female"), + ("US", "medium", "female"), + ("US", "medium", "male"), ], ), ( True, [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("US", "female", "high"), - ("US", "male", "low"), + ("FR", "high", "female"), + ("FR", "low", "male"), + ("FR", "medium", "male"), + ("US", "high", "female"), + ("US", "low", "male"), ], ), ], @@ -344,30 +344,94 @@ def test_data_frame_value_counts_dropna( @pytest.mark.parametrize( "normalize, expected_data", [ - (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + (False, np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64)), ( True, - np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + # NaN values corresponds to non-observed groups + np.array( + [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, np.nan, np.nan] + ), ), ], ) -def test_categorical( +def test_categorical_groupers( education_df, as_index, observed, expected_index, normalize, expected_data ): - # Test categorical data whether or not observed - gp = education_df.astype("category").groupby( - "country", as_index=as_index, observed=observed + education_df = education_df.copy() + education_df["country"] = education_df["country"].astype("category") + education_df["education"] = education_df["education"].astype("category") + + gp = education_df.groupby( + ["country", "education"], as_index=as_index, observed=observed ) result = gp.value_counts(normalize=normalize) expected_series = Series( data=expected_data[expected_data > 0.0] if observed else expected_data, + index=MultiIndex.from_tuples( + expected_index, + 
names=["country", "education", "gender"], + ), + ) + for i in range(2): + expected_series.index = expected_series.index.set_levels( + CategoricalIndex(expected_series.index.levels[i]), level=i + ) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("as_index", [False, True]) +@pytest.mark.parametrize("observed", [False, True]) +@pytest.mark.parametrize( + "normalize, expected_data", + [ + (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + True, + # NaN values corresponds to non-observed groups + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_values(education_df, as_index, observed, normalize, expected_data): + # Test non-observed categories are included in the result, + # regardless of `observed` + education_df = education_df.copy() + education_df["gender"] = education_df["gender"].astype("category") + education_df["education"] = education_df["education"].astype("category") + + gp = education_df.groupby("country", as_index=as_index, observed=observed) + result = gp.value_counts(normalize=normalize) + + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] + expected_series = Series( + data=expected_data, index=MultiIndex.from_tuples( expected_index, names=["country", "gender", "education"], ), ) - for i in range(3): + for i in range(1, 3): expected_series.index = expected_series.index.set_levels( CategoricalIndex(expected_series.index.levels[i]), level=i ) From ba76d71d6e1293df4ef03b22e78d5a4374ab4369 
Mon Sep 17 00:00:00 2001 From: Lucas Date: Wed, 18 May 2022 01:41:33 +0200 Subject: [PATCH 2/5] Fix unobserved categories dtype and normalize behavior + add tests for single grouper --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/groupby/generic.py | 5 +- .../tests/groupby/test_frame_value_counts.py | 165 +++++++++++++++++- 3 files changed, 164 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 2bd474271483d..430fc3543e55d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -813,7 +813,7 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.view` not accepting integer dtypes (:issue:`25464`) - Bug in :meth:`CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`) -- Bug in :meth:`DataFrameGroupBy.value_counts` includes non-observed categories of non-grouping columns regardless of ``observed`` (:issue:`46357`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` includes non-observed categories of non-grouping columns regardless of ``observed`` (:issue:`46357`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9c2caff40b498..5f4c5de96871e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1842,7 +1842,7 @@ def value_counts( and not grouping._observed for grouping in groupings ): - levels_list = [ping.group_index for ping in groupings] + levels_list = [ping.result_index for ping in groupings] multi_index, _ = MultiIndex.from_product( levels_list, names=[ping.name for ping in groupings] ).sortlevel() @@ -1862,6 +1862,9 @@ def value_counts( ).transform("sum") result_series /= indexed_group_size + # Handle groups of non-observed categories + result_series = result_series.fillna(0.0) + if sort: # Sort the values and then resort by the main grouping index_level = 
range(len(self.grouper.groupings)) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index bd53aea80f54d..55d81ac78239b 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -308,6 +308,156 @@ def test_data_frame_value_counts_dropna( tm.assert_series_equal(result_frame_groupby, expected) +def _test_categorical_single_grouper( + education_df, as_index, observed, expected_index, normalize, expected_data +): + # Test single categorical grouper when non-groupers are also categorical + education_df = education_df.copy().astype("category") + + # Add non-observed grouping categories + education_df["country"] = education_df["country"].cat.add_categories(["ASIA"]) + + gp = education_df.groupby("country", as_index=as_index, observed=observed) + result = gp.value_counts(normalize=normalize) + + expected_series = Series( + data=expected_data, + index=MultiIndex.from_tuples( + expected_index, + names=["country", "gender", "education"], + ), + ) + for i in range(3): + index_level = CategoricalIndex(expected_series.index.levels[i]) + if i == 0: + index_level = index_level.set_categories( + education_df["country"].cat.categories + ) + expected_series.index = expected_series.index.set_levels(index_level, level=i) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize( + "normalize, expected_data", + [ + (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + True, + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_single_grouper_observed_true( + education_df, as_index, normalize, expected_data +): + # GH#46357 + + expected_index = [ + ("FR", 
"male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] + + _test_categorical_single_grouper( + education_df=education_df, + as_index=as_index, + observed=True, + expected_index=expected_index, + normalize=normalize, + expected_data=expected_data, + ) + + +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize( + "normalize, expected_data", + [ + ( + False, + np.array( + [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64 + ), + ), + ( + True, + np.array( + [ + 0.5, + 0.25, + 0.25, + 0.0, + 0.0, + 0.0, + 0.5, + 0.5, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ] + ), + ), + ], +) +def test_categorical_single_grouper_observed_false( + education_df, as_index, normalize, expected_data +): + # GH#46357 + + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "male", "high"), + ("FR", "female", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "male", "medium"), + ("US", "male", "high"), + ("US", "female", "medium"), + ("US", "female", "low"), + ("ASIA", "male", "low"), + ("ASIA", "male", "high"), + ("ASIA", "female", "medium"), + ("ASIA", "female", "low"), + ("ASIA", "female", "high"), + ("ASIA", "male", "medium"), + ] + + _test_categorical_single_grouper( + education_df=education_df, + as_index=as_index, + observed=False, + expected_index=expected_index, + normalize=normalize, + expected_data=expected_data, + ) + + @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize( "observed, expected_index", @@ -348,15 +498,16 @@ def test_data_frame_value_counts_dropna( ( True, # NaN values corresponds to non-observed 
groups - np.array( - [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, np.nan, np.nan] - ), + np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]), ), ], ) -def test_categorical_groupers( +def test_categorical_multiple_groupers( education_df, as_index, observed, expected_index, normalize, expected_data ): + # GH#46357 + + # Test multiple categorical groupers when non-groupers are non-categorical education_df = education_df.copy() education_df["country"] = education_df["country"].astype("category") education_df["education"] = education_df["education"].astype("category") @@ -400,8 +551,10 @@ def test_categorical_groupers( ), ], ) -def test_categorical_values(education_df, as_index, observed, normalize, expected_data): - # Test non-observed categories are included in the result, +def test_categorical_non_groupers( + education_df, as_index, observed, normalize, expected_data +): + # GH#46357 Test non-observed categories are included in the result, # regardless of `observed` education_df = education_df.copy() education_df["gender"] = education_df["gender"].astype("category") education_df["education"] = education_df["education"].astype("category") From 8cc3e498454af875842e4dc530340d3f62e72c8d Mon Sep 17 00:00:00 2001 From: LucasG0 Date: Sun, 22 May 2022 17:10:17 +0200 Subject: [PATCH 3/5] Rename _test_categorical_single_grouper to assert_categorical_single_grouper --- pandas/tests/groupby/test_frame_value_counts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 55d81ac78239b..ba2018efd230c 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -308,7 +308,7 @@ def test_data_frame_value_counts_dropna( tm.assert_series_equal(result_frame_groupby, expected) -def _test_categorical_single_grouper( +def assert_categorical_single_grouper( education_df, as_index, observed, expected_index, normalize, expected_data ): # Test single
categorical grouper when non-groupers are also categorical @@ -375,7 +375,7 @@ def test_categorical_single_grouper_observed_true( ("US", "male", "medium"), ] - _test_categorical_single_grouper( + assert_categorical_single_grouper( education_df=education_df, as_index=as_index, observed=True, @@ -448,7 +448,7 @@ def test_categorical_single_grouper_observed_false( ("ASIA", "male", "medium"), ] - _test_categorical_single_grouper( + assert_categorical_single_grouper( education_df=education_df, as_index=as_index, observed=False, From 81aa3085eae0415d9a360c8aac01d163366af5db Mon Sep 17 00:00:00 2001 From: Lucas Date: Mon, 6 Jun 2022 19:01:17 +0200 Subject: [PATCH 4/5] Restore original single grouper test + add notable fix sub-section --- doc/source/whatsnew/v1.5.0.rst | 40 +++++++++++++ .../tests/groupby/test_frame_value_counts.py | 60 +++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 430fc3543e55d..f4ea7a843edf9 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -389,6 +389,46 @@ upon serialization. (Related issue :issue:`12997`) # Roundtripping now works pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index + +DataFrameGroupBy.value_counts with non-grouping categorical columns and ``observed=True`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Calling :meth:`.DataFrameGroupBy.value_counts` with ``observed=True`` would incorrectly drop non-observed categories of non-grouping columns (:issue:`46357`). + +.. code-block:: ipython + + In [6]: df = pd.DataFrame(["a", "b", "c"], dtype="category").iloc[0:2] + In [7]: df + Out[7]: + 0 + 0 a + 1 b + +*Old Behavior* + +.. code-block:: ipython + + In [8]: df.groupby(level=0, observed=True).value_counts() + Out[8]: + 0 a 1 + 1 b 1 + dtype: int64 + + +*New Behavior* + +.. 
code-block:: ipython + + In [9]: df.groupby(level=0, observed=True).value_counts() + Out[9]: + 0 a 1 + 1 a 0 + b 1 + 0 b 0 + c 0 + 1 c 0 + dtype: int64 + .. --------------------------------------------------------------------------- .. _whatsnew_150.api_breaking: diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index ba2018efd230c..1e679ad4e7aad 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -308,6 +308,66 @@ def test_data_frame_value_counts_dropna( tm.assert_series_equal(result_frame_groupby, expected) +@pytest.mark.parametrize("as_index", [False, True]) +@pytest.mark.parametrize("observed", [False, True]) +@pytest.mark.parametrize( + "normalize, expected_data", + [ + (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + True, + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_single_grouper_with_only_observed_categories( + education_df, as_index, observed, normalize, expected_data +): + + # Test single categorical grouper with only observed grouping categories + # when non-groupers are also categorical + + gp = education_df.astype("category").groupby( + "country", as_index=as_index, observed=observed + ) + result = gp.value_counts(normalize=normalize) + + expected_index = MultiIndex.from_tuples( + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ], + names=["country", "gender", "education"], + ) + + expected_series = Series( + data=expected_data, + index=expected_index, + ) + for i in range(3): + expected_series.index = expected_series.index.set_levels( + 
CategoricalIndex(expected_series.index.levels[i]), level=i + ) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + def assert_categorical_single_grouper( education_df, as_index, observed, expected_index, normalize, expected_data ): From fa3869c28388981a9022efa204077bd486e885ec Mon Sep 17 00:00:00 2001 From: Lucas Date: Mon, 13 Jun 2022 14:45:59 +0200 Subject: [PATCH 5/5] Fix what's new: remove extra line and add notable_bug_fixes line --- doc/source/whatsnew/v1.5.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index f4ea7a843edf9..9abc2966d2aa1 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -390,6 +390,8 @@ upon serialization. (Related issue :issue:`12997`) pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index +.. _whatsnew_150.notable_bug_fixes.groupby_value_counts_categorical: + DataFrameGroupBy.value_counts with non-grouping categorical columns and ``observed=True`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -853,7 +855,6 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.view` not accepting integer dtypes (:issue:`25464`) - Bug in :meth:`CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`) -- Bug in :meth:`.DataFrameGroupBy.value_counts` includes non-observed categories of non-grouping columns regardless of ``observed`` (:issue:`46357`) Datetimelike ^^^^^^^^^^^^