diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 578431e9ab2d8..127fc97d87124 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -393,6 +393,48 @@ upon serialization. (Related issue :issue:`12997`) # Roundtripping now works pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index + +.. _whatsnew_150.notable_bug_fixes.groupby_value_counts_categorical: + +DataFrameGroupBy.value_counts with non-grouping categorical columns and ``observed=True`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Calling :meth:`.DataFrameGroupBy.value_counts` with ``observed=True`` would incorrectly drop non-observed categories of non-grouping columns (:issue:`46357`). + +.. code-block:: ipython + + In [6]: df = pd.DataFrame(["a", "b", "c"], dtype="category").iloc[0:2] + In [7]: df + Out[7]: + 0 + 0 a + 1 b + +*Old Behavior* + +.. code-block:: ipython + + In [8]: df.groupby(level=0, observed=True).value_counts() + Out[8]: + 0 a 1 + 1 b 1 + dtype: int64 + + +*New Behavior* + +.. code-block:: ipython + + In [9]: df.groupby(level=0, observed=True).value_counts() + Out[9]: + 0 a 1 + 1 a 0 + b 1 + 0 b 0 + c 0 + 1 c 0 + dtype: int64 + .. --------------------------------------------------------------------------- .. _whatsnew_150.api_breaking: @@ -820,9 +862,8 @@ Bug fixes Categorical ^^^^^^^^^^^ -- Bug in :meth:`.Categorical.view` not accepting integer dtypes (:issue:`25464`) -- Bug in :meth:`.CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`) -- +- Bug in :meth:`Categorical.view` not accepting integer dtypes (:issue:`25464`) +- Bug in :meth:`CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 631f70f390319..8c9ba452dd0b0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -69,6 +69,7 @@ reconstruct_func, validate_func_kwargs, ) +from pandas.core.arrays.categorical import Categorical import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame @@ -87,6 +88,7 @@ MultiIndex, all_indexes_same, ) +from pandas.core.indexes.category import CategoricalIndex from pandas.core.series import Series from pandas.core.shared_docs import _shared_docs from pandas.core.util.numba_ import maybe_use_numba @@ -1821,6 +1823,7 @@ def value_counts( key=key, axis=self.axis, sort=self.sort, + observed=False, dropna=dropna, ) groupings += list(grouper.groupings) @@ -1834,6 +1837,19 @@ def value_counts( ) result_series = cast(Series, gb.size()) + # GH-46357 Include non-observed categories + # of non-grouping columns regardless of `observed` + if any( + isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) + and not grouping._observed + for grouping in groupings + ): + levels_list = [ping.result_index for ping in groupings] + multi_index, _ = MultiIndex.from_product( + levels_list, names=[ping.name for ping in groupings] + ).sortlevel() + result_series = result_series.reindex(multi_index, fill_value=0) + if normalize: # Normalize the results by dividing by the original group sizes. # We are guaranteed to have the first N levels be the @@ -1844,12 +1860,13 @@ def value_counts( indexed_group_size = result_series.groupby( result_series.index.droplevel(levels), sort=self.sort, - observed=self.observed, dropna=self.dropna, ).transform("sum") - result_series /= indexed_group_size + # Handle groups of non-observed categories + result_series = result_series.fillna(0.0) + if sort: # Sort the values and then resort by the main grouping index_level = range(len(self.grouper.groupings)) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index affef05ba4ed3..1e679ad4e7aad 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -309,34 +309,244 @@ def test_data_frame_value_counts_dropna( @pytest.mark.parametrize("as_index", [False, True]) +@pytest.mark.parametrize("observed", [False, True]) +@pytest.mark.parametrize( + "normalize, expected_data", + [ + (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + True, + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_single_grouper_with_only_observed_categories( + education_df, as_index, observed, normalize, expected_data +): + + # Test single categorical grouper with only observed grouping categories + # when non-groupers are also categorical + + gp = education_df.astype("category").groupby( + "country", as_index=as_index, observed=observed + ) + result = gp.value_counts(normalize=normalize) + + expected_index = MultiIndex.from_tuples( + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ], + names=["country", "gender", "education"], + ) + + expected_series = Series( + data=expected_data, + index=expected_index, + ) + for i in range(3): + expected_series.index = expected_series.index.set_levels( + CategoricalIndex(expected_series.index.levels[i]), level=i + ) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +def assert_categorical_single_grouper( + education_df, as_index, observed, expected_index, normalize, expected_data +): + # Test single categorical grouper when non-groupers are also categorical + education_df = education_df.copy().astype("category") + + # Add non-observed grouping categories + education_df["country"] = education_df["country"].cat.add_categories(["ASIA"]) + + gp = education_df.groupby("country", as_index=as_index, observed=observed) + result = gp.value_counts(normalize=normalize) + + expected_series = Series( + data=expected_data, + index=MultiIndex.from_tuples( + expected_index, + names=["country", "gender", "education"], + ), + ) + for i in range(3): + index_level = CategoricalIndex(expected_series.index.levels[i]) + if i == 0: + index_level = index_level.set_categories( + education_df["country"].cat.categories + ) + expected_series.index = expected_series.index.set_levels(index_level, level=i) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize( + "normalize, expected_data", + [ + (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + True, + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_single_grouper_observed_true( + education_df, as_index, normalize, expected_data +): + # GH#46357 + + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] + + assert_categorical_single_grouper( + education_df=education_df, + as_index=as_index, + observed=True, + expected_index=expected_index, + normalize=normalize, + expected_data=expected_data, + ) + + +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize( + "normalize, expected_data", + [ + ( + False, + np.array( + [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64 + ), + ), + ( + True, + np.array( + [ + 0.5, + 0.25, + 0.25, + 0.0, + 0.0, + 0.0, + 0.5, + 0.5, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ] + ), + ), + ], +) +def test_categorical_single_grouper_observed_false( + education_df, as_index, normalize, expected_data +): + # GH#46357 + + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "male", "high"), + ("FR", "female", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "male", "medium"), + ("US", "male", "high"), + ("US", "female", "medium"), + ("US", "female", "low"), + ("ASIA", "male", "low"), + ("ASIA", "male", "high"), + ("ASIA", "female", "medium"), + ("ASIA", "female", "low"), + ("ASIA", "female", "high"), + ("ASIA", "male", "medium"), + ] + + assert_categorical_single_grouper( + education_df=education_df, + as_index=as_index, + observed=False, + expected_index=expected_index, + normalize=normalize, + expected_data=expected_data, + ) + + +@pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize( "observed, expected_index", [ ( False, [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), + ("FR", "high", "female"), + ("FR", "high", "male"), + ("FR", "low", "male"), + ("FR", "low", "female"), + ("FR", "medium", "male"), + ("FR", "medium", "female"), + ("US", "high", "female"), + ("US", "high", "male"), + ("US", "low", "male"), + ("US", "low", "female"), + ("US", "medium", "female"), + ("US", "medium", "male"), ], ), ( True, [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("US", "female", "high"), - ("US", "male", "low"), + ("FR", "high", "female"), + ("FR", "low", "male"), + ("FR", "medium", "male"), + ("US", "high", "female"), + ("US", "low", "male"), ], ), ], @@ -344,30 +554,97 @@ def test_data_frame_value_counts_dropna( @pytest.mark.parametrize( "normalize, expected_data", [ - (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + (False, np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64)), ( True, - np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + # NaN values corresponds to non-observed groups + np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]), ), ], ) -def test_categorical( +def test_categorical_multiple_groupers( education_df, as_index, observed, expected_index, normalize, expected_data ): - # Test categorical data whether or not observed - gp = education_df.astype("category").groupby( - "country", as_index=as_index, observed=observed + # GH#46357 + + # Test multiple categorical groupers when non-groupers are non-categorical + education_df = education_df.copy() + education_df["country"] = education_df["country"].astype("category") + education_df["education"] = education_df["education"].astype("category") + + gp = education_df.groupby( + ["country", "education"], as_index=as_index, observed=observed ) result = gp.value_counts(normalize=normalize) expected_series = Series( data=expected_data[expected_data > 0.0] if observed else expected_data, + index=MultiIndex.from_tuples( + expected_index, + names=["country", "education", "gender"], + ), + ) + for i in range(2): + expected_series.index = expected_series.index.set_levels( + CategoricalIndex(expected_series.index.levels[i]), level=i + ) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("as_index", [False, True]) +@pytest.mark.parametrize("observed", [False, True]) +@pytest.mark.parametrize( + "normalize, expected_data", + [ + (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + True, + # NaN values corresponds to non-observed groups + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_non_groupers( + education_df, as_index, observed, normalize, expected_data +): + # GH#46357 Test non-observed categories are included in the result, + # regardless of `observed` + education_df = education_df.copy() + education_df["gender"] = education_df["gender"].astype("category") + education_df["education"] = education_df["education"].astype("category") + + gp = education_df.groupby("country", as_index=as_index, observed=observed) + result = gp.value_counts(normalize=normalize) + + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] + expected_series = Series( + data=expected_data, index=MultiIndex.from_tuples( expected_index, names=["country", "gender", "education"], ), ) - for i in range(3): + for i in range(1, 3): expected_series.index = expected_series.index.set_levels( CategoricalIndex(expected_series.index.levels[i]), level=i )