From 963b7e1267e04848b1fc33aadd45d565529cd6f7 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 1 Nov 2021 08:28:04 +0100 Subject: [PATCH 01/77] Add DataFrameGroupBy.value_counts --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/groupby/generic.py | 136 +++++++++++- .../tests/groupby/test_frame_value_counts.py | 208 ++++++++++++++++++ 3 files changed, 344 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/groupby/test_frame_value_counts.py diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 5601048c409e1..0dbfed6e98ec9 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -181,6 +181,7 @@ Other enhancements - The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`) - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`) - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`) +- Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8a330d08bef78..7a2dcb9fdcce6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -15,8 +15,10 @@ Callable, Hashable, Iterable, + List, Mapping, NamedTuple, + Sequence, TypeVar, Union, cast, @@ -67,7 +69,10 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import base +from pandas.core.groupby import ( + base, + ops, +) from pandas.core.groupby.groupby import ( GroupBy, _agg_template, @@ -75,6 +80,11 @@ _transform_template, warn_dropping_nuisance_columns_deprecated, ) +from pandas.core.groupby.grouper import ( + Grouper, + Grouping, + get_grouper, +) from pandas.core.indexes.api import ( Index, MultiIndex, @@ -1568,6 +1578,130 @@ def func(df): boxplot = boxplot_frame_groupby + def value_counts( + self, + subset: Sequence[Hashable] | None = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> DataFrame | Series: + with self._group_selection_context(): + df = self.obj + + # Check for index rather than column grouping + index_grouping = self.grouper.groupings[0].name is None + + # Try to find column names + if index_grouping: + keys = [] + remaining_columns = self._selected_obj.columns + elif isinstance(self._selected_obj, Series): + keys = [grouping.name for grouping in self.grouper.groupings] + remaining_columns = [self._selected_obj.name] + else: + if isinstance(self.keys, ops.BaseGrouper): + keys = [grouping.name for grouping in self.keys.groupings] + elif isinstance(self.keys, str): + keys = [self.keys] + else: + keys = cast(List[str], self.keys) + + remaining_columns = [ + key for key in self._selected_obj.columns if key not in keys + ] + + if subset is not None: + remaining_columns = [key for key in subset if key not in keys] + + if dropna: + df = df.dropna(subset=remaining_columns, axis="index", how="any") + + grouper, _, _ = get_grouper( + df, + key=self.keys, + axis=self.axis, + level=self.level, + sort=self.sort, + mutated=self.mutated, + ) + + groupings = grouper.groupings + [ + cast(Grouping, Grouper(key)) for key in remaining_columns + ] + + result = df.groupby( + groupings, + as_index=self.as_index, + sort=self.sort, + dropna=dropna, + ).size() + result.name = "size" + + if normalize: + indexed_group_size = df.groupby( + grouper, sort=self.sort, dropna=dropna + ).size() + if self.as_index: + if index_grouping: + # The common index needs a common name + indexed_group_size.index.set_names("Group", inplace=True) + result.index.set_names("Group", level=0, inplace=True) + # Use indexed group size series + result /= indexed_group_size + if index_grouping: + result.index.set_names(None, level=0, inplace=True) + else: + # Make indexed key group size series + indexed_group_size.name = "group_size" + if index_grouping: + # Get the column name of the added groupby index column + index_column_name = result.columns[0] + indexed_group_size.index.set_names( + index_column_name, inplace=True + ) + left_on = index_column_name + else: + left_on = keys + if not index_grouping and len(keys) == 1: + # Compose with single key group size series + group_size = indexed_group_size[result[keys[0]]] + else: + # Merge multiple key group size series + merged = result.merge( + indexed_group_size, + how="left", + left_on=left_on, + right_index=True, + ) + group_size = merged["group_size"] + + result["size"] /= group_size.values + + if sort: + if self.as_index: + if index_grouping: + level: Any = 0 + else: + level = keys + result = ( + cast(Series, result) + .sort_values(ascending=ascending) + .sort_index(level=level, sort_remaining=False) + ) + else: + if index_grouping: + by: Any = "level_0" + else: + by = keys + result = ( + cast(DataFrame, result) + .sort_values(by="size", ascending=ascending) + .sort_values(by=by, ascending=True) + ) + + return result + def _wrap_transform_general_frame( obj: DataFrame, group: DataFrame, res: DataFrame | Series diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py new file mode 100644 index 0000000000000..4e0b6b5622a16 --- /dev/null +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -0,0 +1,208 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.fixture +def education_df(): + return pd.DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", "high", "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + + +@pytest.fixture +def country_index(): + return ["U", "F", "U", "F", "F", "F"] + + +def _frame_value_counts(df, keys, normalize, sort, ascending): + return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) + + +@pytest.mark.parametrize("column", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize( + "sort, ascending", + [ + (False, None), + (True, True), + (True, False), + ], +) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("frame", [True, False]) +def test_basic( + education_df, country_index, column, normalize, sort, ascending, as_index, frame +): + # gh43564 with added: + # - Use column or index + # - Whether or not to normalize + # - Whether or not to sort and how + # - Whether or not to use the groupby as an index + # - 3-way compare against :meth:`~DataFrame.value_counts` + # and `~SeriesGroupBy.value_counts` + gp = education_df.groupby("country" if column else country_index, as_index=as_index) + result = gp[["gender", "education"]].value_counts( + normalize=normalize, sort=sort, ascending=ascending + ) + + if frame: + # compare against apply with DataFrame value_counts + expected = gp.apply( + _frame_value_counts, ["gender", "education"], normalize, sort, ascending + ) + expected.name = "size" + if as_index: + tm.assert_series_equal(result, expected) + else: + assert np.array_equal(result["size"].values, expected.values) + elif column or as_index: + # (otherwise SeriesGroupby crashes) + # compare against SeriesGroupBy value_counts + education_df["both"] = education_df["gender"] + "-" + education_df["education"] + expected = gp["both"].value_counts( + normalize=normalize, sort=sort, ascending=ascending + ) + if as_index: + assert np.array_equal(result.values, expected.values) + else: + assert np.array_equal(result["size"].values, expected["size"].values) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize( + "sort, ascending", + [ + (False, None), + (True, True), + (True, False), + ], +) +def test_compound(education_df, normalize, sort, ascending): + # Multiple groupby keys and as_index=False + gp = education_df.groupby(["country", "gender"], as_index=False, sort=False) + result = gp["education"].value_counts( + normalize=normalize, sort=sort, ascending=ascending + ) + + if sort: + # compare against apply with DataFrame value_counts + expected = gp.apply( + _frame_value_counts, "education", normalize, sort, ascending + ).values + else: + expected = [1.0, 1.0 / 3, 1.0, 2.0 / 3, 1.0] if normalize else [1, 1, 1, 2, 1] + + assert np.array_equal(result["size"].values, expected) + + +@pytest.fixture +def animals_df(): + return pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + +@pytest.mark.parametrize( + "sort, ascending, normalize, expected_data, expected_index", + [ + (False, None, False, [1, 2, 1], [(2, 4, 6), (2, 0, 0)]), + (True, True, False, [1, 1, 2], [(2, 6, 4), (2, 0, 0)]), + (True, False, False, [2, 1, 1], [(4, 2, 6), (0, 2, 0)]), + (True, False, False, [2, 1, 1], [(4, 2, 6), (0, 2, 0)]), + (True, False, True, [0.5, 0.25, 0.25], [(4, 2, 6), (0, 2, 0)]), + ], +) +def test_data_frame_value_counts( + animals_df, sort, ascending, normalize, expected_data, expected_index +): + # 3-way compare with :meth:`~DataFrame.value_counts` + # Tests from frame/methods/test_value_counts.py + result_frame = animals_df.value_counts( + sort=sort, ascending=ascending, normalize=normalize + ) + expected = pd.Series( + data=expected_data, + index=pd.MultiIndex.from_arrays( + expected_index, names=["num_legs", "num_wings"] + ), + ) + tm.assert_series_equal(result_frame, expected) + + animals_df["key"] = 1 + + result_frame_groupby = animals_df.groupby("key").value_counts( + sort=sort, ascending=ascending, normalize=normalize + ) + result_frame_groupby.reset_index(drop=True, level="key", inplace=True) + result_frame_groupby.name = None + tm.assert_series_equal(result_frame_groupby, expected) + + +@pytest.fixture +def names_with_nulls_df(nulls_fixture): + return pd.DataFrame( + { + "first_name": ["John", "Anne", "John", "Beth"], + "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) + + +@pytest.mark.parametrize( + "dropna, expected_data, expected_index", + [ + ( + True, + [1, 1], + pd.MultiIndex.from_arrays( + [("Beth", "John"), ("Louise", "Smith")], + names=["first_name", "middle_name"], + ), + ), + ( + False, + [1, 1, 1, 1], + pd.MultiIndex( + levels=[ + pd.Index(["Anne", "Beth", "John"]), + pd.Index(["Louise", "Smith", np.nan]), + ], + codes=[[0, 1, 2, 2], [2, 0, 1, 2]], + names=["first_name", "middle_name"], + ), + ), + ], +) +@pytest.mark.parametrize("normalize", [False, True]) +def test_data_frame_value_counts_dropna( + names_with_nulls_df, dropna, normalize, expected_data, expected_index +): + # GH 41334 + # 3-way compare with :meth:`~DataFrame.value_counts` + # Tests with nulls from frame/methods/test_value_counts.py + result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize) + expected = pd.Series( + data=expected_data, + index=expected_index, + ) + if normalize: + expected /= float(len(expected_data)) + + tm.assert_series_equal(result_frame, expected) + + names_with_nulls_df["key"] = 1 + result_frame_groupby = names_with_nulls_df.groupby("key").value_counts( + dropna=dropna, normalize=normalize + ) + result_frame_groupby.reset_index(drop=True, level="key", inplace=True) + result_frame_groupby.name = None + + tm.assert_series_equal(result_frame_groupby, expected) From 1f710e07c1ef9c00cd904fc914364ac6384b7349 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 1 Nov 2021 08:53:20 +0100 Subject: [PATCH 02/77] Update test_frame_value_counts.py --- pandas/tests/groupby/test_frame_value_counts.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 4e0b6b5622a16..5ed153b154784 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -61,7 +61,7 @@ def test_basic( if as_index: tm.assert_series_equal(result, expected) else: - assert np.array_equal(result["size"].values, expected.values) + tm.assert_numpy_array_equal(result["size"].values, expected.values) elif column or as_index: # (otherwise SeriesGroupby crashes) # compare against SeriesGroupBy value_counts @@ -70,9 +70,9 @@ def test_basic( normalize=normalize, sort=sort, ascending=ascending ) if as_index: - assert np.array_equal(result.values, expected.values) + tm.assert_numpy_array_equal(result.values, expected.values) else: - assert np.array_equal(result["size"].values, expected["size"].values) + tm.assert_numpy_array_equal(result["size"].values, expected["size"].values) @pytest.mark.parametrize("normalize", [True, False]) @@ -96,10 +96,12 @@ def test_compound(education_df, normalize, sort, ascending): expected = gp.apply( _frame_value_counts, "education", normalize, sort, ascending ).values + elif normalize: + expected = np.array([1.0, 1.0 / 3, 1.0, 2.0 / 3, 1.0]) else: - expected = [1.0, 1.0 / 3, 1.0, 2.0 / 3, 1.0] if normalize else [1, 1, 1, 2, 1] + expected = np.array([1, 1, 1, 2, 1], dtype=np.int64) - assert np.array_equal(result["size"].values, expected) + tm.assert_numpy_array_equal(result["size"].values, expected) @pytest.fixture From 35313837419229248fa4f920a780327d961ee1a1 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 1 Nov 2021 09:18:08 +0100 Subject: [PATCH 03/77] Catch axis=1 --- pandas/core/groupby/generic.py | 4 ++++ pandas/tests/groupby/test_frame_value_counts.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7a2dcb9fdcce6..b8f7682f3c4e5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1586,6 +1586,10 @@ def value_counts( ascending: bool = False, dropna: bool = True, ) -> DataFrame | Series: + + if self.axis == 1: + raise NotImplementedError("DataFrameGroupBy.value_counts only handles axis=0") + with self._group_selection_context(): df = self.obj diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 5ed153b154784..4e6bdcd0572d4 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -208,3 +208,9 @@ def test_data_frame_value_counts_dropna( result_frame_groupby.name = None tm.assert_series_equal(result_frame_groupby, expected) + + +def test_axis(animals_df): + gp = animals_df.groupby([0, 0], axis=1) + with pytest.raises(NotImplementedError, match="axis"): + gp.value_counts() \ No newline at end of file From d7f733b6b6d8360aaf96d76e75f12879f139eac6 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 1 Nov 2021 09:26:45 +0100 Subject: [PATCH 04/77] Add to base and tab_completion --- pandas/core/groupby/base.py | 1 + pandas/tests/groupby/test_allowlist.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 986aaa07a913c..48faa1fc46759 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -143,6 +143,7 @@ class OutputKey: "take", "transform", "sample", + "value_counts", ] ) # Valid values of `name` for `groupby.transform(name)` diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index aa7229f1ab7b3..717d8b5d910c9 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -338,6 +338,7 @@ def test_tab_completion(mframe): "pipe", "sample", "ewm", + "value_counts", } assert results == expected From eb067ecce83556ef7b276be8ba3121b73cdac957 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 1 Nov 2021 09:29:38 +0100 Subject: [PATCH 05/77] Line too long --- pandas/core/groupby/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b8f7682f3c4e5..f3eb3ff16198c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1588,7 +1588,9 @@ def value_counts( ) -> DataFrame | Series: if self.axis == 1: - raise NotImplementedError("DataFrameGroupBy.value_counts only handles axis=0") + raise NotImplementedError( + "DataFrameGroupBy.value_counts only handles axis=0" + ) with self._group_selection_context(): df = self.obj From a6a07d1818a01302e6c8ad09e29724bc2885391f Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 1 Nov 2021 09:30:30 +0100 Subject: [PATCH 06/77] Update test_frame_value_counts.py --- pandas/tests/groupby/test_frame_value_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 4e6bdcd0572d4..e3000b0efba76 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -213,4 +213,4 @@ def test_data_frame_value_counts_dropna( def test_axis(animals_df): gp = animals_df.groupby([0, 0], axis=1) with pytest.raises(NotImplementedError, match="axis"): - gp.value_counts() \ No newline at end of file + gp.value_counts() From 6a22a5747f34fc3f345ac9afbafff09a74db7515 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 1 Nov 2021 10:20:37 +0100 Subject: [PATCH 07/77] Add docstring --- pandas/core/groupby/generic.py | 64 ++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f3eb3ff16198c..154efcd387d4d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1586,7 +1586,71 @@ def value_counts( ascending: bool = False, dropna: bool = True, ) -> DataFrame | Series: + """ + Return a Series or DataFrame containing counts of unique rows. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + dropna : bool, default True + Don’t include counts of rows that contain NA values. + Returns + ------- + Series or DataFrame + Series if as_index is True, otherwise DataFrame. + + See Also + -------- + Series.value_counts: Equivalent method on Series. + DataFrame.value_counts: Equivalent method on DataFrame. + SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. + + Notes + ----- + If the groupby as_index is True then the returned Series will have a + MultiIndex with one level per input column. + If the groupby as_index is False then the returned DataFrame will have an + additional column with the value_counts. + By default, rows that contain any NA values are omitted from + the result. By default, the result will be in descending order so that the + first element of each group is the most frequently-occurring row. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... }) + + >>> df + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + >>> df.groupby("gender").value_counts(normalize=True) + gender education country + female high FR 0.50 + US 0.50 + male low FR 0.50 + US 0.25 + medium FR 0.25 + Name: size, dtype: float64 + """ if self.axis == 1: raise NotImplementedError( "DataFrameGroupBy.value_counts only handles axis=0" From 9492ee4fa7e648f255cfa8939a13d4de9c66b414 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 1 Nov 2021 10:41:35 +0100 Subject: [PATCH 08/77] Update generic.py --- pandas/core/groupby/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 154efcd387d4d..b33661900f575 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1618,9 +1618,9 @@ def value_counts( Notes ----- If the groupby as_index is True then the returned Series will have a - MultiIndex with one level per input column. + MultiIndex with one level per input column. If the groupby as_index is False then the returned DataFrame will have an - additional column with the value_counts. + additional column with the value_counts. By default, rows that contain any NA values are omitted from the result. By default, the result will be in descending order so that the first element of each group is the most frequently-occurring row. @@ -1628,7 +1628,7 @@ def value_counts( Examples -------- >>> df = pd.DataFrame({ - ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] ... }) From b9885fd2210499e2d2caa49107969d8078cd557e Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 1 Nov 2021 11:35:34 +0100 Subject: [PATCH 09/77] Update groupby.rst --- doc/source/reference/groupby.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index ccf130d03418c..2bb0659264eb0 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -122,6 +122,7 @@ application to columns of a specific data type. DataFrameGroupBy.skew DataFrameGroupBy.take DataFrameGroupBy.tshift + DataFrameGroupBy.value_counts The following methods are available only for ``SeriesGroupBy`` objects. From e896879c7b31f97a696e2180e2ce9b56ba89f7a6 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 1 Nov 2021 13:04:51 +0100 Subject: [PATCH 10/77] generic.py types --- pandas/core/groupby/generic.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b33661900f575..512208ed10f4a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -15,7 +15,6 @@ Callable, Hashable, Iterable, - List, Mapping, NamedTuple, Sequence, @@ -1664,7 +1663,7 @@ def value_counts( # Try to find column names if index_grouping: - keys = [] + keys: Any = [] remaining_columns = self._selected_obj.columns elif isinstance(self._selected_obj, Series): keys = [grouping.name for grouping in self.grouper.groupings] @@ -1675,7 +1674,7 @@ def value_counts( elif isinstance(self.keys, str): keys = [self.keys] else: - keys = cast(List[str], self.keys) + keys = self.keys remaining_columns = [ key for key in self._selected_obj.columns if key not in keys From 5b49322b300c7c620f692b5efc9e5db8bea3aa9e Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 1 Nov 2021 16:43:52 +0100 Subject: [PATCH 11/77] Add observed parameter --- pandas/core/groupby/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 512208ed10f4a..374653fdb3330 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1703,13 +1703,14 @@ def value_counts( groupings, as_index=self.as_index, sort=self.sort, + observed=self.observed, dropna=dropna, ).size() result.name = "size" if normalize: indexed_group_size = df.groupby( - grouper, sort=self.sort, dropna=dropna + grouper, sort=self.sort, observed=self.observed, dropna=dropna ).size() if self.as_index: if index_grouping: From 26353ee9110f96f8de4d38057aeb2041f9dd4f3c Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 3 Nov 2021 01:27:47 +0100 Subject: [PATCH 12/77] Change output name to "count" and deal with categorical data --- pandas/core/groupby/generic.py | 38 ++++++++++++++----- .../tests/groupby/test_frame_value_counts.py | 25 +++++++----- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 374653fdb3330..c08e4dc389f34 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1648,8 +1648,9 @@ def value_counts( male low FR 0.50 US 0.25 medium FR 0.25 - Name: size, dtype: float64 + Name: count, dtype: float64 """ + RESULT_NAME = "count" if self.axis == 1: raise NotImplementedError( "DataFrameGroupBy.value_counts only handles axis=0" @@ -1686,7 +1687,8 @@ def value_counts( if dropna: df = df.dropna(subset=remaining_columns, axis="index", how="any") - grouper, _, _ = get_grouper( + # Add the remaining_column keys to the main grouper + main_grouper, _, _ = get_grouper( df, key=self.keys, axis=self.axis, @@ -1694,11 +1696,20 @@ def value_counts( sort=self.sort, mutated=self.mutated, ) + groupings = list(main_grouper.groupings) + for key in remaining_columns: + grouper, _, _ = get_grouper( + df, + key=key, + axis=self.axis, + level=self.level, + sort=self.sort, + mutated=self.mutated, + dropna=dropna, + ) + groupings += list(grouper.groupings) - groupings = grouper.groupings + [ - cast(Grouping, Grouper(key)) for key in remaining_columns - ] - + # Take the size of the overall columns result = df.groupby( groupings, as_index=self.as_index, @@ -1706,11 +1717,17 @@ def value_counts( observed=self.observed, dropna=dropna, ).size() - result.name = "size" + + # Change the nameof the size result + if self.as_index: + result.name = RESULT_NAME + else: + result.rename({"size": RESULT_NAME}, axis=1, inplace=True) if normalize: + # Normalize the results be dividing by the original group sizes indexed_group_size = df.groupby( - grouper, sort=self.sort, observed=self.observed, dropna=dropna + main_grouper, sort=self.sort, observed=self.observed, dropna=dropna ).size() if self.as_index: if index_grouping: @@ -1746,9 +1763,10 @@ def value_counts( ) group_size = merged["group_size"] - result["size"] /= group_size.values + result[RESULT_NAME] /= group_size.values if sort: + # Sort the values and then resort by the main grouping if self.as_index: if index_grouping: level: Any = 0 @@ -1766,7 +1784,7 @@ def value_counts( by = keys result = ( cast(DataFrame, result) - .sort_values(by="size", ascending=ascending) + .sort_values(by=RESULT_NAME, ascending=ascending) .sort_values(by=by, ascending=True) ) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index e3000b0efba76..0457eaeb5b471 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -4,6 +4,14 @@ import pandas as pd import pandas._testing as tm +RESULT_NAME = "count" + + +def test_axis(animals_df): + gp = animals_df.groupby([0, 0], axis=1) + with pytest.raises(NotImplementedError, match="axis"): + gp.value_counts() + @pytest.fixture def education_df(): @@ -57,11 +65,12 @@ def test_basic( expected = gp.apply( _frame_value_counts, ["gender", "education"], normalize, sort, ascending ) - expected.name = "size" + + expected.name = RESULT_NAME if as_index: tm.assert_series_equal(result, expected) else: - tm.assert_numpy_array_equal(result["size"].values, expected.values) + tm.assert_numpy_array_equal(result[RESULT_NAME].values, expected.values) elif column or as_index: # (otherwise SeriesGroupby crashes) # compare against SeriesGroupBy value_counts @@ -72,7 +81,9 @@ def test_basic( if as_index: tm.assert_numpy_array_equal(result.values, expected.values) else: - tm.assert_numpy_array_equal(result["size"].values, expected["size"].values) + tm.assert_numpy_array_equal( + result[RESULT_NAME].values, expected[RESULT_NAME].values + ) @pytest.mark.parametrize("normalize", [True, False]) @@ -101,7 +112,7 @@ def test_compound(education_df, normalize, sort, ascending): else: expected = np.array([1, 1, 1, 2, 1], dtype=np.int64) - tm.assert_numpy_array_equal(result["size"].values, expected) + tm.assert_numpy_array_equal(result[RESULT_NAME].values, expected) @pytest.fixture @@ -208,9 +219,3 @@ def test_data_frame_value_counts_dropna( result_frame_groupby.name = None tm.assert_series_equal(result_frame_groupby, expected) - - -def test_axis(animals_df): - gp = animals_df.groupby([0, 0], axis=1) - with pytest.raises(NotImplementedError, match="axis"): - gp.value_counts() From 0e065b3c49ce36ad2e58ba8e8abb89ffd8225828 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 3 Nov 2021 09:33:38 +0100 Subject: [PATCH 13/77] Update generic.py --- pandas/core/groupby/generic.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c08e4dc389f34..288c6bf81ba46 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -79,11 +79,7 @@ _transform_template, warn_dropping_nuisance_columns_deprecated, ) -from pandas.core.groupby.grouper import ( - Grouper, - Grouping, - get_grouper, -) +from pandas.core.groupby.grouper import get_grouper from pandas.core.indexes.api import ( Index, MultiIndex, @@ -1695,6 +1691,7 @@ def value_counts( level=self.level, sort=self.sort, mutated=self.mutated, + dropna=dropna, ) groupings = list(main_grouper.groupings) for key in remaining_columns: From b821fca7b281edcddbbf2c10f631bb60e0b52c13 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 3 Nov 2021 10:58:00 +0100 Subject: [PATCH 14/77] Add test_categorical --- .../tests/groupby/test_frame_value_counts.py | 42 +++++++++++++++++-- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 0457eaeb5b471..355e6be4cba57 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -104,15 +104,15 @@ def test_compound(education_df, normalize, sort, ascending): if sort: # compare against apply with DataFrame value_counts - expected = gp.apply( + expected_values = gp.apply( _frame_value_counts, "education", normalize, sort, ascending ).values elif normalize: - expected = np.array([1.0, 1.0 / 3, 1.0, 2.0 / 3, 1.0]) + expected_values = np.array([1.0, 1.0 / 3, 1.0, 2.0 / 3, 1.0]) else: - expected = np.array([1, 1, 1, 2, 1], dtype=np.int64) + expected_values = np.array([1, 1, 1, 2, 1], dtype=np.int64) - tm.assert_numpy_array_equal(result[RESULT_NAME].values, expected) + tm.assert_numpy_array_equal(result[RESULT_NAME].values, expected_values) @pytest.fixture @@ -219,3 +219,37 @@ def test_data_frame_value_counts_dropna( result_frame_groupby.name = None tm.assert_series_equal(result_frame_groupby, expected) + + +@pytest.mark.parametrize("as_index", [False, True]) +@pytest.mark.parametrize("observed", [False, True]) +@pytest.mark.parametrize("normalize", [False, True]) +def test_categorical(education_df, as_index, observed, normalize): + gp = education_df.astype("category").groupby( + "country", as_index=as_index, observed=observed + ) + result = gp.value_counts(normalize=normalize) + + if observed: + gp = education_df.groupby("country", as_index=as_index) + expected = gp.value_counts(normalize=normalize) + if as_index: + tm.assert_numpy_array_equal(result.values, expected.values) + else: + tm.assert_numpy_array_equal( + result[RESULT_NAME].values, expected[RESULT_NAME].values + ) + else: + if normalize: + expected_values = np.array( + [0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0] + ) + else: + expected_values = np.array( + [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64 + ) + + if as_index: + tm.assert_numpy_array_equal(result.values, expected_values) + else: + tm.assert_numpy_array_equal(result[RESULT_NAME].values, expected_values) From 19d7257b26f1bd2ea18cdc1a749d8538c67003c2 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 3 Nov 2021 11:26:44 +0100 Subject: [PATCH 15/77] Update test_frame_value_counts.py --- pandas/tests/groupby/test_frame_value_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 355e6be4cba57..06f7432e698e6 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -129,7 +129,6 @@ def animals_df(): (False, None, False, [1, 2, 1], [(2, 4, 6), (2, 0, 0)]), (True, True, False, [1, 1, 2], [(2, 6, 4), (2, 0, 0)]), (True, False, False, [2, 1, 1], [(4, 2, 6), (0, 2, 0)]), - (True, False, False, [2, 1, 1], [(4, 2, 6), (0, 2, 0)]), (True, False, True, [0.5, 0.25, 0.25], [(4, 2, 6), (0, 2, 0)]), ], ) @@ -225,6 +224,7 @@ def test_data_frame_value_counts_dropna( @pytest.mark.parametrize("observed", [False, True]) @pytest.mark.parametrize("normalize", [False, True]) def test_categorical(education_df, as_index, observed, normalize): + # Test catagorical data whether or not observed gp = education_df.astype("category").groupby( "country", as_index=as_index, observed=observed ) From 71ee5f449418c383dd5db9ec2a0084890a098745 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 3 Nov 2021 23:24:07 +0000 Subject: [PATCH 16/77] Add by=function test --- .../tests/groupby/test_frame_value_counts.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 06f7432e698e6..551155a797aaa 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -24,15 +24,11 @@ def education_df(): ) -@pytest.fixture -def country_index(): - return ["U", "F", "U", "F", "F", "F"] - - def _frame_value_counts(df, keys, normalize, sort, ascending): return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) +@pytest.mark.parametrize("groupby", ["column", "array", "function"]) @pytest.mark.parametrize("column", [True, False]) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( @@ -46,16 +42,22 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) def test_basic( - education_df, country_index, column, normalize, sort, ascending, as_index, frame + education_df, groupby, column, normalize, sort, ascending, as_index, frame ): # gh43564 with added: - # - Use column or index + # - Use column, array or function as by= parameter # - Whether or not to normalize # - Whether or not to sort and how # - Whether or not to use the groupby as an index # - 3-way compare against :meth:`~DataFrame.value_counts` # and `~SeriesGroupBy.value_counts` - gp = education_df.groupby("country" if column else country_index, as_index=as_index) + by = { + "column": "country", + "array": education_df["country"].values, + "function": lambda x: education_df["country"][x] == "US", + }[groupby] + + gp = education_df.groupby(by=by, as_index=as_index) result = gp[["gender", "education"]].value_counts( normalize=normalize, sort=sort, ascending=ascending ) @@ -71,7 +73,7 @@ def test_basic( tm.assert_series_equal(result, expected) else: tm.assert_numpy_array_equal(result[RESULT_NAME].values, expected.values) - elif column or as_index: + elif groupby == "column" or as_index: # (otherwise SeriesGroupby crashes) # compare against SeriesGroupBy value_counts education_df["both"] = education_df["gender"] + "-" + education_df["education"] From 1dd2db06ca589765ece381a140dcbbe52988076f Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 3 Nov 2021 23:26:42 +0000 Subject: [PATCH 17/77] Update test_frame_value_counts.py --- pandas/tests/groupby/test_frame_value_counts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 551155a797aaa..cd8d5f25fe511 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -29,7 +29,6 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("groupby", ["column", "array", "function"]) -@pytest.mark.parametrize("column", [True, False]) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending", @@ -42,7 +41,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) def test_basic( - education_df, groupby, column, normalize, sort, ascending, as_index, frame + education_df, groupby, normalize, sort, ascending, as_index, frame ): # gh43564 with added: # - Use column, array or function as by= parameter From 1c18d7d87f93970e66e5d925107fa43ea1d24f90 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 3 Nov 2021 23:30:43 +0000 Subject: [PATCH 18/77] Update test_frame_value_counts.py --- pandas/tests/groupby/test_frame_value_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index cd8d5f25fe511..1bf29dd1d34e0 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -225,7 +225,7 @@ def test_data_frame_value_counts_dropna( @pytest.mark.parametrize("observed", [False, True]) @pytest.mark.parametrize("normalize", [False, True]) def test_categorical(education_df, as_index, observed, normalize): - # Test catagorical data whether or not observed + # Test categorical data whether or not observed gp = education_df.astype("category").groupby( "country", as_index=as_index, observed=observed ) From f25e86182fad300a5717cef90dab3e87d6d6bcfc Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 3 Nov 2021 23:33:58 +0000 Subject: [PATCH 19/77] Update generic.py --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 288c6bf81ba46..5ccf8d805a184 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1722,7 +1722,7 @@ def value_counts( result.rename({"size": RESULT_NAME}, axis=1, inplace=True) if normalize: - # Normalize the results be dividing by the original group sizes + # Normalize the results by dividing by the original group sizes indexed_group_size = df.groupby( main_grouper, sort=self.sort, observed=self.observed, dropna=dropna ).size() From faac0f061a202026c6be6f1195c900be2b0acb7b Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 3 Nov 2021 23:50:14 +0000 Subject: [PATCH 20/77] Update test_frame_value_counts.py --- pandas/tests/groupby/test_frame_value_counts.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 1bf29dd1d34e0..d370b2a98c91b 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -40,9 +40,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): ) @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) -def test_basic( - education_df, groupby, normalize, sort, ascending, as_index, frame -): +def test_basic(education_df, groupby, normalize, sort, ascending, as_index, frame): # gh43564 with added: # - Use column, array or function as by= parameter # - Whether or not to normalize From 0f615da04d19af4ac20735a4eedd803ba1e074cd Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 5 Nov 2021 09:20:50 +0000 Subject: [PATCH 21/77] Update v1.4.0.rst --- doc/source/whatsnew/v1.4.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 5d5914c864947..19eca9e341d71 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -183,7 +183,6 @@ Other enhancements - :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`) - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`) - Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`) -- - :meth:`.GroupBy.mean` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`) .. --------------------------------------------------------------------------- From 50d4c5990a524bd336b9ab16694d94ce26bd2de6 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 14 Nov 2021 13:04:32 +0000 Subject: [PATCH 22/77] Reset index after sorting --- pandas/core/groupby/generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index fa64a4e3ac395..0f0063f3a0180 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1784,6 +1784,7 @@ def value_counts( cast(DataFrame, result) .sort_values(by=RESULT_NAME, ascending=ascending) .sort_values(by=by, ascending=True) + .reset_index(drop=True) ) return result From 9b2869f5b4371b21dfc9402e0210a2b804db2a5e Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 14 Nov 2021 20:56:36 +0000 Subject: [PATCH 23/77] Toughen up testing for groupers in keys --- pandas/core/groupby/grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7577b1e671d60..c24ceb0ffd50c 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -800,7 +800,7 @@ def get_grouper( # what are we after, exactly? any_callable = any(callable(g) or isinstance(g, dict) for g in keys) - any_groupers = any(isinstance(g, Grouper) for g in keys) + any_groupers = any(isinstance(g, Grouper) or isinstance(g, Grouping) for g in keys) any_arraylike = any( isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys ) From 3de613236d6cadb8894148f0d441599180be9b33 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 14 Nov 2021 20:57:47 +0000 Subject: [PATCH 24/77] De-numpy most of the tests --- .../tests/groupby/test_frame_value_counts.py | 128 ++++++++++++------ 1 file changed, 83 insertions(+), 45 deletions(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index d370b2a98c91b..2000b4095562c 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -8,7 +8,7 @@ def test_axis(animals_df): - gp = animals_df.groupby([0, 0], axis=1) + gp = animals_df.groupby([0, 0, 0], axis=1) with pytest.raises(NotImplementedError, match="axis"): gp.value_counts() @@ -24,6 +24,25 @@ def education_df(): ) +def test_basic(education_df): + # gh43564 + result = education_df.groupby('country')[['gender', 'education']].value_counts(normalize=True) + expected = pd.Series( + name="count", + data=[0.5, 0.25, 0.25, 0.5, 0.5], + index=pd.MultiIndex.from_tuples( + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ], + names=["country", "gender", "education"], + ), + ) + tm.assert_series_equal(result, expected) + def _frame_value_counts(df, keys, normalize, sort, ascending): return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) @@ -40,14 +59,17 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): ) @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) -def test_basic(education_df, groupby, normalize, sort, ascending, as_index, frame): - # gh43564 with added: +def test_against_frame_and_seriesgroupby( + education_df, groupby, normalize, sort, ascending, as_index, frame +): + # test all parameters: # - Use column, array or function as by= parameter # - Whether or not to normalize # - Whether or not to sort and how # - Whether or not to use the groupby as an index - # - 3-way compare against :meth:`~DataFrame.value_counts` - # and `~SeriesGroupBy.value_counts` + # - 3-way compare against: + # - apply with :meth:`~DataFrame.value_counts` + # - `~SeriesGroupBy.value_counts` (apart from certain cases where it crashes) by = { "column": "country", "array": education_df["country"].values, @@ -58,7 +80,6 @@ def test_basic(education_df, groupby, normalize, sort, ascending, as_index, fram result = gp[["gender", "education"]].value_counts( normalize=normalize, sort=sort, ascending=ascending ) - if frame: # compare against apply with DataFrame value_counts expected = gp.apply( @@ -69,7 +90,15 @@ def test_basic(education_df, groupby, normalize, sort, ascending, as_index, fram if as_index: tm.assert_series_equal(result, expected) else: - tm.assert_numpy_array_equal(result[RESULT_NAME].values, expected.values) + expected = expected.reset_index().rename({0: "count"}, axis=1) + if groupby == "column": + expected = expected.rename({"level_0": "country"}, axis=1) + expected["country"] = np.where(expected["country"], "US", "FR") + elif groupby == "function": + expected["level_0"] = expected["level_0"] == 1 + else: + expected["level_0"] = np.where(expected["level_0"], "US", "FR") + tm.assert_frame_equal(result, expected) elif groupby == "column" or as_index: # (otherwise SeriesGroupby crashes) # compare against SeriesGroupBy value_counts @@ -78,46 +107,57 @@ def test_basic(education_df, groupby, normalize, sort, ascending, as_index, fram normalize=normalize, sort=sort, ascending=ascending ) if as_index: - tm.assert_numpy_array_equal(result.values, expected.values) + index_frame = expected.index.to_frame(index=False) + index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) + index_frame["education"] = index_frame["both"].str.split("-").str.get(1) + del index_frame["both"] + index_frame = index_frame.rename({0: None}, axis=1) + expected.index = pd.MultiIndex.from_frame(index_frame) + expected.name = RESULT_NAME + tm.assert_series_equal(result, expected) else: - tm.assert_numpy_array_equal( - result[RESULT_NAME].values, expected[RESULT_NAME].values - ) + expected.insert(1, "gender", expected["both"].str.split("-").str.get(0)) + expected.insert(2, "education", expected["both"].str.split("-").str.get(1)) + del expected["both"] + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( - "sort, ascending", + "sort, ascending, expected_rows, expected_count, expected_group_size", [ - (False, None), - (True, True), - (True, False), + (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]), + (True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]), + (True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]), ], ) -def test_compound(education_df, normalize, sort, ascending): +def test_compound( + education_df, + normalize, + sort, + ascending, + expected_rows, + expected_count, + expected_group_size, +): # Multiple groupby keys and as_index=False gp = education_df.groupby(["country", "gender"], as_index=False, sort=False) result = gp["education"].value_counts( normalize=normalize, sort=sort, ascending=ascending ) - - if sort: - # compare against apply with DataFrame value_counts - expected_values = gp.apply( - _frame_value_counts, "education", normalize, sort, ascending - ).values - elif normalize: - expected_values = np.array([1.0, 1.0 / 3, 1.0, 2.0 / 3, 1.0]) - else: - expected_values = np.array([1, 1, 1, 2, 1], dtype=np.int64) - - tm.assert_numpy_array_equal(result[RESULT_NAME].values, expected_values) + expected = pd.DataFrame() + for column in ["country", "gender", "education"]: + expected[column] = [education_df[column][row] for row in expected_rows] + expected["count"] = expected_count + if normalize: + expected["count"] /= expected_group_size + tm.assert_frame_equal(result, expected) @pytest.fixture def animals_df(): return pd.DataFrame( - {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, index=["falcon", "dog", "cat", "ant"], ) @@ -125,10 +165,10 @@ def animals_df(): @pytest.mark.parametrize( "sort, ascending, normalize, expected_data, expected_index", [ - (False, None, False, [1, 2, 1], [(2, 4, 6), (2, 0, 0)]), - (True, True, False, [1, 1, 2], [(2, 6, 4), (2, 0, 0)]), - (True, False, False, [2, 1, 1], [(4, 2, 6), (0, 2, 0)]), - (True, False, True, [0.5, 0.25, 0.25], [(4, 2, 6), (0, 2, 0)]), + (False, None, False, [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]), + (True, True, False, [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]), + (True, False, False, [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), + (True, False, True, [0.5, 0.25, 0.25], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), ], ) def test_data_frame_value_counts( @@ -142,18 +182,16 @@ def test_data_frame_value_counts( expected = pd.Series( data=expected_data, index=pd.MultiIndex.from_arrays( - expected_index, names=["num_legs", "num_wings"] + expected_index, names=["key", "num_legs", "num_wings"] ), ) tm.assert_series_equal(result_frame, expected) - animals_df["key"] = 1 - + expected.name = RESULT_NAME result_frame_groupby = animals_df.groupby("key").value_counts( sort=sort, ascending=ascending, normalize=normalize ) - result_frame_groupby.reset_index(drop=True, level="key", inplace=True) - result_frame_groupby.name = None + tm.assert_series_equal(result_frame_groupby, expected) @@ -161,6 +199,7 @@ def test_data_frame_value_counts( def names_with_nulls_df(nulls_fixture): return pd.DataFrame( { + "key": [1, 1, 1, 1], "first_name": ["John", "Anne", "John", "Beth"], "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], }, @@ -174,8 +213,8 @@ def names_with_nulls_df(nulls_fixture): True, [1, 1], pd.MultiIndex.from_arrays( - [("Beth", "John"), ("Louise", "Smith")], - names=["first_name", "middle_name"], + [(1, 1), ("Beth", "John"), ("Louise", "Smith")], + names=["key", "first_name", "middle_name"], ), ), ( @@ -183,11 +222,12 @@ def names_with_nulls_df(nulls_fixture): [1, 1, 1, 1], pd.MultiIndex( levels=[ + pd.Index([1]), pd.Index(["Anne", "Beth", "John"]), pd.Index(["Louise", "Smith", np.nan]), ], - codes=[[0, 1, 2, 2], [2, 0, 1, 2]], - names=["first_name", "middle_name"], + codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]], + names=["key", "first_name", "middle_name"], ), ), ], @@ -209,12 +249,10 @@ def test_data_frame_value_counts_dropna( tm.assert_series_equal(result_frame, expected) - names_with_nulls_df["key"] = 1 + expected.name = RESULT_NAME result_frame_groupby = names_with_nulls_df.groupby("key").value_counts( dropna=dropna, normalize=normalize ) - result_frame_groupby.reset_index(drop=True, level="key", inplace=True) - result_frame_groupby.name = None tm.assert_series_equal(result_frame_groupby, expected) From a9c2b83f1ad3f8f30ee1cae18625abb5b78b01fc Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 15 Nov 2021 00:52:50 +0000 Subject: [PATCH 25/77] Update test_frame_value_counts.py --- pandas/tests/groupby/test_frame_value_counts.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 2000b4095562c..a06e35d9a973c 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -26,7 +26,9 @@ def education_df(): def test_basic(education_df): # gh43564 - result = education_df.groupby('country')[['gender', 'education']].value_counts(normalize=True) + result = education_df.groupby("country")[["gender", "education"]].value_counts( + normalize=True + ) expected = pd.Series( name="count", data=[0.5, 0.25, 0.25, 0.5, 0.5], @@ -43,6 +45,7 @@ def test_basic(education_df): ) tm.assert_series_equal(result, expected) + def _frame_value_counts(df, keys, normalize, sort, ascending): return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) From 6905bcdf136bf0f94da768c2611c038996643776 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 15 Nov 2021 14:36:23 +0000 Subject: [PATCH 26/77] Better detection of non-column grouping --- pandas/core/groupby/generic.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a98c632530267..8226e1d79df4f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1656,11 +1656,11 @@ def value_counts( with self._group_selection_context(): df = self.obj - # Check for index rather than column grouping - index_grouping = self.grouper.groupings[0].name is None + # Check for mapping, array or function rather than column grouping + non_column_grouping = not self.grouper.groupings[0].in_axis # Try to find column names - if index_grouping: + if non_column_grouping: keys: Any = [] remaining_columns = self._selected_obj.columns elif isinstance(self._selected_obj, Series): @@ -1728,18 +1728,18 @@ def value_counts( main_grouper, sort=self.sort, observed=self.observed, dropna=dropna ).size() if self.as_index: - if index_grouping: + if non_column_grouping: # The common index needs a common name indexed_group_size.index.set_names("Group", inplace=True) result.index.set_names("Group", level=0, inplace=True) # Use indexed group size series result /= indexed_group_size - if index_grouping: + if non_column_grouping: result.index.set_names(None, level=0, inplace=True) else: # Make indexed key group size series indexed_group_size.name = "group_size" - if index_grouping: + if non_column_grouping: # Get the column name of the added groupby index column index_column_name = result.columns[0] indexed_group_size.index.set_names( @@ -1748,7 +1748,7 @@ def value_counts( left_on = index_column_name else: left_on = keys - if not index_grouping and len(keys) == 1: + if not non_column_grouping and len(keys) == 1: # Compose with single key group size series group_size = indexed_group_size[result[keys[0]]] else: @@ -1766,7 +1766,7 @@ def value_counts( if sort: # Sort the values and then resort by the main grouping if self.as_index: - if index_grouping: + if non_column_grouping: level: Any = 0 else: level = keys @@ -1776,7 +1776,7 @@ def value_counts( .sort_index(level=level, sort_remaining=False) ) else: - if index_grouping: + if non_column_grouping: by: Any = "level_0" else: by = keys From eb9600f8af853de014a1913e8d7d53beadd6a750 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 15 Nov 2021 20:02:44 +0000 Subject: [PATCH 27/77] Finish de-numpying the tests --- .../tests/groupby/test_frame_value_counts.py | 87 +++++++++++++------ 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index a06e35d9a973c..a8e1b5e845c74 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -261,35 +261,72 @@ def test_data_frame_value_counts_dropna( @pytest.mark.parametrize("as_index", [False, True]) -@pytest.mark.parametrize("observed", [False, True]) -@pytest.mark.parametrize("normalize", [False, True]) -def test_categorical(education_df, as_index, observed, normalize): +@pytest.mark.parametrize( + "observed, expected_index", + [ + ( + False, + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ], + ), + ( + True, + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ], + ), + ], +) +@pytest.mark.parametrize( + "normalize, expected_data", + [ + (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + True, + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical( + education_df, as_index, observed, expected_index, normalize, expected_data +): # Test categorical data whether or not observed gp = education_df.astype("category").groupby( "country", as_index=as_index, observed=observed ) result = gp.value_counts(normalize=normalize) - if observed: - gp = education_df.groupby("country", as_index=as_index) - expected = gp.value_counts(normalize=normalize) - if as_index: - tm.assert_numpy_array_equal(result.values, expected.values) - else: - tm.assert_numpy_array_equal( - result[RESULT_NAME].values, expected[RESULT_NAME].values - ) - else: - if normalize: - expected_values = np.array( - [0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0] - ) - else: - expected_values = np.array( - [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64 - ) + expected_series = pd.Series( + name="count", + data=expected_data[expected_data > 0.0] if observed else expected_data, + index=pd.MultiIndex.from_tuples( + expected_index, + names=["country", "gender", "education"], + ), + ) + for i in range(3): + expected_series.index = expected_series.index.set_levels( + pd.CategoricalIndex(expected_series.index.levels[i]), level=i + ) - if as_index: - tm.assert_numpy_array_equal(result.values, expected_values) - else: - tm.assert_numpy_array_equal(result[RESULT_NAME].values, expected_values) + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.to_frame().reset_index() + tm.assert_frame_equal(result, expected) From 0ae521804a52858974e95c7ae7922b24ad3b0886 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 15 Nov 2021 21:00:45 +0000 Subject: [PATCH 28/77] Dropna changes --- pandas/core/groupby/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8226e1d79df4f..9c81d593894ef 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1692,7 +1692,7 @@ def value_counts( level=self.level, sort=self.sort, mutated=self.mutated, - dropna=dropna, + dropna=self.dropna, ) groupings = list(main_grouper.groupings) for key in remaining_columns: @@ -1703,7 +1703,7 @@ def value_counts( level=self.level, sort=self.sort, mutated=self.mutated, - dropna=dropna, + dropna=False, ) groupings += list(grouper.groupings) @@ -1713,7 +1713,7 @@ def value_counts( as_index=self.as_index, sort=self.sort, observed=self.observed, - dropna=dropna, + dropna=False, ).size() # Change the nameof the size result From dfa82cba80efbf645fd67d71c9874e9a00caf686 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 15 Nov 2021 22:24:44 +0000 Subject: [PATCH 29/77] Update generic.py --- pandas/core/groupby/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9c81d593894ef..000998ccf2811 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1703,6 +1703,7 @@ def value_counts( level=self.level, sort=self.sort, mutated=self.mutated, + # nulls have already been dropped from remaining_columns dropna=False, ) groupings += list(grouper.groupings) @@ -1713,7 +1714,7 @@ def value_counts( as_index=self.as_index, sort=self.sort, observed=self.observed, - dropna=False, + dropna=self.dropna, ).size() # Change the nameof the size result From 6e2b06e836a2c5c8de8b58f3add12ca40d4e0472 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 15 Nov 2021 22:40:00 +0000 Subject: [PATCH 30/77] Add bad subset trap and test --- pandas/core/groupby/generic.py | 7 ++++++- .../tests/groupby/test_frame_value_counts.py | 18 ++++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 000998ccf2811..cc536aa246b44 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1679,7 +1679,12 @@ def value_counts( ] if subset is not None: - remaining_columns = [key for key in subset if key not in keys] + for key in subset: + if key in keys: + raise ValueError( + f"Key {key} in subset cannot be one of the groupby column keys" + ) + remaining_columns = subset if dropna: df = df.dropna(subset=remaining_columns, axis="index", how="any") diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index a8e1b5e845c74..4b0788dc1fc83 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -7,12 +7,6 @@ RESULT_NAME = "count" -def test_axis(animals_df): - gp = animals_df.groupby([0, 0, 0], axis=1) - with pytest.raises(NotImplementedError, match="axis"): - gp.value_counts() - - @pytest.fixture def education_df(): return pd.DataFrame( @@ -24,6 +18,18 @@ def education_df(): ) +def test_axis(education_df): + gp = education_df.groupby("country", axis=1) + with pytest.raises(NotImplementedError, match="axis"): + gp.value_counts() + + +def test_bad_subset(education_df): + gp = education_df.groupby("country") + with pytest.raises(ValueError, match="subset"): + gp.value_counts(subset=["country"]) + + def test_basic(education_df): # gh43564 result = education_df.groupby("country")[["gender", "education"]].value_counts( From 2dc597294dcecb694d829591ff4d3f4803c08cb4 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 15 Nov 2021 23:27:44 +0000 Subject: [PATCH 31/77] Update generic.py --- pandas/core/groupby/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index cc536aa246b44..d42ae969a3098 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1682,7 +1682,8 @@ def value_counts( for key in subset: if key in keys: raise ValueError( - f"Key {key} in subset cannot be one of the groupby column keys" + f"Key {key} in subset cannot be one of " + "the groupby column keys" ) remaining_columns = subset From 8d8d9b02d7039eaba89426f84345e8aeb6baa207 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 20 Nov 2021 19:52:40 +0000 Subject: [PATCH 32/77] Add more dropna tests and workaround Series bug --- pandas/core/groupby/generic.py | 12 ++++++-- .../tests/groupby/test_frame_value_counts.py | 30 +++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f079b2760df81..69bb453e03bb6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1732,7 +1732,10 @@ def value_counts( if normalize: # Normalize the results by dividing by the original group sizes indexed_group_size = df.groupby( - main_grouper, sort=self.sort, observed=self.observed, dropna=dropna + main_grouper, + sort=self.sort, + observed=self.observed, + dropna=self.dropna, ).size() if self.as_index: if non_column_grouping: @@ -1740,7 +1743,12 @@ def value_counts( indexed_group_size.index.set_names("Group", inplace=True) result.index.set_names("Group", level=0, inplace=True) # Use indexed group size series - result /= indexed_group_size + if self.dropna or len(self.keys) == 1: + result /= indexed_group_size + else: + # Unfortunately, nans in multiindex seem to break Pandas alignment + values = result.values / indexed_group_size.align(result, join="left")[0].values + result = Series(data=values, index = result.index, name=RESULT_NAME) if non_column_grouping: result.index.set_names(None, level=0, inplace=True) else: diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 4b0788dc1fc83..68228160f2b0a 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -204,6 +204,36 @@ def test_data_frame_value_counts( tm.assert_series_equal(result_frame_groupby, expected) +@pytest.fixture +def nulls_df(): + n = np.nan + return pd.DataFrame({ + "A": [1, 1, n, 4, n, 6, 6, 6, 6], + "B": [1, 1, 3, n, n, 6, 6, 6, 6], + "C": [1, 2, 3, 4, 5, 6, n, 8, n], + "D": [1, 2, 3, 4, 5, 6, 7, n, n], + }) + + +@pytest.mark.parametrize( + "group_dropna, count_dropna, expected_rows, expected_values", [ + (False, False, [0, 1, 3, 5, 7, 6, 8, 2, 4], [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0]), + (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), + (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), + (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), + ] +) +def test_dropna_combinations(nulls_df, group_dropna, count_dropna, expected_rows, expected_values): + gp = nulls_df.groupby(["A", "B"], dropna=group_dropna) + result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna) + columns = pd.DataFrame() + for column in nulls_df.columns: + columns[column] = [nulls_df[column][row] for row in expected_rows] + index = pd.MultiIndex.from_frame(columns) + expected = pd.Series(data=expected_values, index=index, name=RESULT_NAME) + tm.assert_series_equal(result, expected) + + @pytest.fixture def names_with_nulls_df(nulls_fixture): return pd.DataFrame( From c43195326c04c932e34023813075d96979ce0805 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 20 Nov 2021 20:31:01 +0000 Subject: [PATCH 33/77] Reformat --- pandas/core/groupby/generic.py | 9 ++++-- .../tests/groupby/test_frame_value_counts.py | 30 ++++++++++++------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 69bb453e03bb6..8c7cd0b31e64c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1747,8 +1747,13 @@ def value_counts( result /= indexed_group_size else: # Unfortunately, nans in multiindex seem to break Pandas alignment - values = result.values / indexed_group_size.align(result, join="left")[0].values - result = Series(data=values, index = result.index, name=RESULT_NAME) + values = ( + result.values + / indexed_group_size.align(result, join="left")[0].values + ) + result = Series( + data=values, index=result.index, name=RESULT_NAME + ) if non_column_grouping: result.index.set_names(None, level=0, inplace=True) else: diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 68228160f2b0a..370a6f4821434 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -207,23 +207,33 @@ def test_data_frame_value_counts( @pytest.fixture def nulls_df(): n = np.nan - return pd.DataFrame({ - "A": [1, 1, n, 4, n, 6, 6, 6, 6], - "B": [1, 1, 3, n, n, 6, 6, 6, 6], - "C": [1, 2, 3, 4, 5, 6, n, 8, n], - "D": [1, 2, 3, 4, 5, 6, 7, n, n], - }) + return pd.DataFrame( + { + "A": [1, 1, n, 4, n, 6, 6, 6, 6], + "B": [1, 1, 3, n, n, 6, 6, 6, 6], + "C": [1, 2, 3, 4, 5, 6, n, 8, n], + "D": [1, 2, 3, 4, 5, 6, 7, n, n], + } + ) @pytest.mark.parametrize( - "group_dropna, count_dropna, expected_rows, expected_values", [ - (False, False, [0, 1, 3, 5, 7, 6, 8, 2, 4], [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0]), + "group_dropna, count_dropna, expected_rows, expected_values", + [ + ( + False, + False, + [0, 1, 3, 5, 7, 6, 8, 2, 4], + [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], + ), (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), - ] + ], ) -def test_dropna_combinations(nulls_df, group_dropna, count_dropna, expected_rows, expected_values): +def test_dropna_combinations( + nulls_df, group_dropna, count_dropna, expected_rows, expected_values +): gp = nulls_df.groupby(["A", "B"], dropna=group_dropna) result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna) columns = pd.DataFrame() From 4d10e4725884ceb9946e3380e8313fb9df9bd676 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 20 Nov 2021 23:32:47 +0000 Subject: [PATCH 34/77] Update generic.py --- pandas/core/groupby/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8c7cd0b31e64c..b23a03002f8df 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1746,7 +1746,8 @@ def value_counts( if self.dropna or len(self.keys) == 1: result /= indexed_group_size else: - # Unfortunately, nans in multiindex seem to break Pandas alignment + # Unfortunately, nans in multiindex seem to break + # Pandas alignment values = ( result.values / indexed_group_size.align(result, join="left")[0].values From e4582ef61ee9a9b33102dde3500790bb84d73b6d Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 21 Nov 2021 11:28:34 +0000 Subject: [PATCH 35/77] Typing fix --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b23a03002f8df..dc21e7bbe206c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1743,7 +1743,7 @@ def value_counts( indexed_group_size.index.set_names("Group", inplace=True) result.index.set_names("Group", level=0, inplace=True) # Use indexed group size series - if self.dropna or len(self.keys) == 1: + if self.dropna or (isinstance(self.keys, list) and len(self.keys) == 1): result /= indexed_group_size else: # Unfortunately, nans in multiindex seem to break From 2a58c42d6eca9401d8ad53da22cea69999797e5b Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 21 Nov 2021 11:32:18 +0000 Subject: [PATCH 36/77] Update generic.py --- pandas/core/groupby/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dc21e7bbe206c..1f03ac338f169 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1743,7 +1743,9 @@ def value_counts( indexed_group_size.index.set_names("Group", inplace=True) result.index.set_names("Group", level=0, inplace=True) # Use indexed group size series - if self.dropna or (isinstance(self.keys, list) and len(self.keys) == 1): + if self.dropna or ( + isinstance(self.keys, list) and len(self.keys) == 1 + ): result /= indexed_group_size else: # Unfortunately, nans in multiindex seem to break From f179fbb7235cecd1dc062444f025816d7c503c8e Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 22 Nov 2021 23:27:26 +0000 Subject: [PATCH 37/77] Update test_frame_value_counts.py --- pandas/tests/groupby/test_frame_value_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 370a6f4821434..fac158ab5b23b 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -374,5 +374,5 @@ def test_categorical( if as_index: tm.assert_series_equal(result, expected_series) else: - expected = expected_series.to_frame().reset_index() + expected = expected_series.reset_index() tm.assert_frame_equal(result, expected) From 98355d530031951be8dd59c08c14e0b10323c8ab Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 23 Nov 2021 10:22:35 +0000 Subject: [PATCH 38/77] Replace self.as_index==False code with reset_index() --- pandas/core/groupby/generic.py | 102 +++++++++------------------------ 1 file changed, 28 insertions(+), 74 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1f03ac338f169..45185ca0dc421 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1717,17 +1717,11 @@ def value_counts( # Take the size of the overall columns result = df.groupby( groupings, - as_index=self.as_index, sort=self.sort, observed=self.observed, dropna=self.dropna, ).size() - - # Change the nameof the size result - if self.as_index: - result.name = RESULT_NAME - else: - result.rename({"size": RESULT_NAME}, axis=1, inplace=True) + result.name = RESULT_NAME if normalize: # Normalize the results by dividing by the original group sizes @@ -1737,79 +1731,39 @@ def value_counts( observed=self.observed, dropna=self.dropna, ).size() - if self.as_index: - if non_column_grouping: - # The common index needs a common name - indexed_group_size.index.set_names("Group", inplace=True) - result.index.set_names("Group", level=0, inplace=True) - # Use indexed group size series - if self.dropna or ( - isinstance(self.keys, list) and len(self.keys) == 1 - ): - result /= indexed_group_size - else: - # Unfortunately, nans in multiindex seem to break - # Pandas alignment - values = ( - result.values - / indexed_group_size.align(result, join="left")[0].values - ) - result = Series( - data=values, index=result.index, name=RESULT_NAME - ) - if non_column_grouping: - result.index.set_names(None, level=0, inplace=True) + if non_column_grouping: + # The common index needs a common name + indexed_group_size.index.set_names("Group", inplace=True) + result.index.set_names("Group", level=0, inplace=True) + # Use indexed group size series + if self.dropna or (isinstance(self.keys, list) and len(self.keys) == 1): + result /= indexed_group_size else: - # Make indexed key group size series - indexed_group_size.name = "group_size" - if non_column_grouping: - # Get the column name of the added groupby index column - index_column_name = result.columns[0] - indexed_group_size.index.set_names( - index_column_name, inplace=True - ) - left_on = index_column_name - else: - left_on = keys - if not non_column_grouping and len(keys) == 1: - # Compose with single key group size series - group_size = indexed_group_size[result[keys[0]]] - else: - # Merge multiple key group size series - merged = result.merge( - indexed_group_size, - how="left", - left_on=left_on, - right_index=True, - ) - group_size = merged["group_size"] - - result[RESULT_NAME] /= group_size.values + # Unfortunately, nans in multi-column multiindex sometimes makes + # size() produce a Series that puts nans in the result + values = ( + result.values + / indexed_group_size.align(result, join="left")[0].values + ) + result = Series(data=values, index=result.index, name=RESULT_NAME) + if non_column_grouping: + result.index.set_names(None, level=0, inplace=True) if sort: # Sort the values and then resort by the main grouping - if self.as_index: - if non_column_grouping: - level: Any = 0 - else: - level = keys - result = ( - cast(Series, result) - .sort_values(ascending=ascending) - .sort_index(level=level, sort_remaining=False) - ) + if non_column_grouping: + level: Any = 0 else: - if non_column_grouping: - by: Any = "level_0" - else: - by = keys - result = ( - cast(DataFrame, result) - .sort_values(by=RESULT_NAME, ascending=ascending) - .sort_values(by=by, ascending=True) - .reset_index(drop=True) - ) + level = keys + result = ( + cast(Series, result) + .sort_values(ascending=ascending) + .sort_index(level=level, sort_remaining=False) + ) + if not self.as_index: + # Convert to frame + result = result.reset_index() return result From fada9a9c16a68a2f99545c514c686ae618f2f205 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 25 Nov 2021 17:22:10 +0000 Subject: [PATCH 39/77] Remove Series name and change column name --- pandas/core/groupby/generic.py | 8 +++---- .../tests/groupby/test_frame_value_counts.py | 22 ++++++++----------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 45185ca0dc421..c2fae4b4e2c76 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1603,7 +1603,7 @@ def value_counts( Returns ------- Series or DataFrame - Series if as_index is True, otherwise DataFrame. + Series if the groupby as_index is True, otherwise DataFrame. See Also -------- @@ -1647,7 +1647,6 @@ def value_counts( medium FR 0.25 Name: count, dtype: float64 """ - RESULT_NAME = "count" if self.axis == 1: raise NotImplementedError( "DataFrameGroupBy.value_counts only handles axis=0" @@ -1721,7 +1720,6 @@ def value_counts( observed=self.observed, dropna=self.dropna, ).size() - result.name = RESULT_NAME if normalize: # Normalize the results by dividing by the original group sizes @@ -1745,7 +1743,7 @@ def value_counts( result.values / indexed_group_size.align(result, join="left")[0].values ) - result = Series(data=values, index=result.index, name=RESULT_NAME) + result = Series(data=values, index=result.index) if non_column_grouping: result.index.set_names(None, level=0, inplace=True) @@ -1763,7 +1761,7 @@ def value_counts( if not self.as_index: # Convert to frame - result = result.reset_index() + result = result.reset_index(name="proportion" if normalize else "count") return result diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index fac158ab5b23b..acdffc990654d 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -4,8 +4,6 @@ import pandas as pd import pandas._testing as tm -RESULT_NAME = "count" - @pytest.fixture def education_df(): @@ -36,7 +34,6 @@ def test_basic(education_df): normalize=True ) expected = pd.Series( - name="count", data=[0.5, 0.25, 0.25, 0.5, 0.5], index=pd.MultiIndex.from_tuples( [ @@ -95,11 +92,11 @@ def test_against_frame_and_seriesgroupby( _frame_value_counts, ["gender", "education"], normalize, sort, ascending ) - expected.name = RESULT_NAME if as_index: tm.assert_series_equal(result, expected) else: - expected = expected.reset_index().rename({0: "count"}, axis=1) + name = "proportion" if normalize else "count" + expected = expected.reset_index().rename({0: name}, axis=1) if groupby == "column": expected = expected.rename({"level_0": "country"}, axis=1) expected["country"] = np.where(expected["country"], "US", "FR") @@ -115,6 +112,7 @@ def test_against_frame_and_seriesgroupby( expected = gp["both"].value_counts( normalize=normalize, sort=sort, ascending=ascending ) + expected.name = None if as_index: index_frame = expected.index.to_frame(index=False) index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) @@ -122,7 +120,6 @@ def test_against_frame_and_seriesgroupby( del index_frame["both"] index_frame = index_frame.rename({0: None}, axis=1) expected.index = pd.MultiIndex.from_frame(index_frame) - expected.name = RESULT_NAME tm.assert_series_equal(result, expected) else: expected.insert(1, "gender", expected["both"].str.split("-").str.get(0)) @@ -157,9 +154,11 @@ def test_compound( expected = pd.DataFrame() for column in ["country", "gender", "education"]: expected[column] = [education_df[column][row] for row in expected_rows] - expected["count"] = expected_count if normalize: - expected["count"] /= expected_group_size + expected["proportion"] = expected_count + expected["proportion"] /= expected_group_size + else: + expected["count"] = expected_count tm.assert_frame_equal(result, expected) @@ -196,7 +195,6 @@ def test_data_frame_value_counts( ) tm.assert_series_equal(result_frame, expected) - expected.name = RESULT_NAME result_frame_groupby = animals_df.groupby("key").value_counts( sort=sort, ascending=ascending, normalize=normalize ) @@ -240,7 +238,7 @@ def test_dropna_combinations( for column in nulls_df.columns: columns[column] = [nulls_df[column][row] for row in expected_rows] index = pd.MultiIndex.from_frame(columns) - expected = pd.Series(data=expected_values, index=index, name=RESULT_NAME) + expected = pd.Series(data=expected_values, index=index) tm.assert_series_equal(result, expected) @@ -298,7 +296,6 @@ def test_data_frame_value_counts_dropna( tm.assert_series_equal(result_frame, expected) - expected.name = RESULT_NAME result_frame_groupby = names_with_nulls_df.groupby("key").value_counts( dropna=dropna, normalize=normalize ) @@ -359,7 +356,6 @@ def test_categorical( result = gp.value_counts(normalize=normalize) expected_series = pd.Series( - name="count", data=expected_data[expected_data > 0.0] if observed else expected_data, index=pd.MultiIndex.from_tuples( expected_index, @@ -374,5 +370,5 @@ def test_categorical( if as_index: tm.assert_series_equal(result, expected_series) else: - expected = expected_series.reset_index() + expected = expected_series.reset_index(name="proportion" if normalize else "count") tm.assert_frame_equal(result, expected) From 8e3f3599738abdeb7d9ad43b12c5d70c290d0103 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 25 Nov 2021 17:36:01 +0000 Subject: [PATCH 40/77] Change non_column_grouping --- pandas/core/groupby/generic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c2fae4b4e2c76..f997b1c569d3f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1656,10 +1656,10 @@ def value_counts( df = self.obj # Check for mapping, array or function rather than column grouping - non_column_grouping = not self.grouper.groupings[0].in_axis + in_axis = self.grouper.groupings[0].in_axis # Try to find column names - if non_column_grouping: + if not in_axis: keys: Any = [] remaining_columns = self._selected_obj.columns elif isinstance(self._selected_obj, Series): @@ -1729,7 +1729,7 @@ def value_counts( observed=self.observed, dropna=self.dropna, ).size() - if non_column_grouping: + if not in_axis: # The common index needs a common name indexed_group_size.index.set_names("Group", inplace=True) result.index.set_names("Group", level=0, inplace=True) @@ -1744,12 +1744,12 @@ def value_counts( / indexed_group_size.align(result, join="left")[0].values ) result = Series(data=values, index=result.index) - if non_column_grouping: + if not in_axis: result.index.set_names(None, level=0, inplace=True) if sort: # Sort the values and then resort by the main grouping - if non_column_grouping: + if not in_axis: level: Any = 0 else: level = keys From ca1593750fa39c58fa7dbfa6279a380566bcdc23 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 25 Nov 2021 17:38:30 +0000 Subject: [PATCH 41/77] Update test_frame_value_counts.py --- pandas/tests/groupby/test_frame_value_counts.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index acdffc990654d..0b449eba3ca3e 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -370,5 +370,7 @@ def test_categorical( if as_index: tm.assert_series_equal(result, expected_series) else: - expected = expected_series.reset_index(name="proportion" if normalize else "count") + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) tm.assert_frame_equal(result, expected) From f055323020dae35b825f4659110f03a80c9deb74 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 25 Nov 2021 23:48:26 +0000 Subject: [PATCH 42/77] Update generic.py --- pandas/core/groupby/generic.py | 38 +++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f997b1c569d3f..b2848e46fe7ca 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1613,12 +1613,15 @@ def value_counts( Notes ----- - If the groupby as_index is True then the returned Series will have a - MultiIndex with one level per input column. - If the groupby as_index is False then the returned DataFrame will have an - additional column with the value_counts. + - If the groupby as_index is True then the returned Series will have a + MultiIndex with one level per input column. + - If the groupby as_index is False then the returned DataFrame will have an + additional column with the value_counts. + By default, rows that contain any NA values are omitted from - the result. By default, the result will be in descending order so that the + the result. + + By default, the result will be in descending order so that the first element of each group is the most frequently-occurring row. Examples @@ -1714,12 +1717,15 @@ def value_counts( groupings += list(grouper.groupings) # Take the size of the overall columns - result = df.groupby( - groupings, - sort=self.sort, - observed=self.observed, - dropna=self.dropna, - ).size() + result = cast( + Series, + df.groupby( + groupings, + sort=self.sort, + observed=self.observed, + dropna=self.dropna, + ).size(), + ) if normalize: # Normalize the results by dividing by the original group sizes @@ -1731,8 +1737,8 @@ def value_counts( ).size() if not in_axis: # The common index needs a common name - indexed_group_size.index.set_names("Group", inplace=True) - result.index.set_names("Group", level=0, inplace=True) + indexed_group_size.index.set_names("_group_", inplace=True) + result.index.set_names("_group_", level=0, inplace=True) # Use indexed group size series if self.dropna or (isinstance(self.keys, list) and len(self.keys) == 1): result /= indexed_group_size @@ -1753,10 +1759,8 @@ def value_counts( level: Any = 0 else: level = keys - result = ( - cast(Series, result) - .sort_values(ascending=ascending) - .sort_index(level=level, sort_remaining=False) + result = result.sort_values(ascending=ascending).sort_index( + level=level, sort_remaining=False ) if not self.as_index: From 86a0df631e9d821d85664345a581602c75e3fb80 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 26 Nov 2021 09:22:58 +0000 Subject: [PATCH 43/77] Correct docstring example --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b2848e46fe7ca..b8df9fdc0b47c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1648,7 +1648,7 @@ def value_counts( male low FR 0.50 US 0.25 medium FR 0.25 - Name: count, dtype: float64 + dtype: float64 """ if self.axis == 1: raise NotImplementedError( From c13eef058dcd6656d887c7945b5c79ccf38ba792 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 26 Nov 2021 16:56:10 +0000 Subject: [PATCH 44/77] Improve bad subset message --- pandas/core/groupby/generic.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b8df9fdc0b47c..0bea1c7da5963 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1681,12 +1681,12 @@ def value_counts( ] if subset is not None: - for key in subset: - if key in keys: - raise ValueError( - f"Key {key} in subset cannot be one of " - "the groupby column keys" - ) + clashing = set(subset) & set(keys) + if clashing: + raise ValueError( + f"Keys {clashing} in subset cannot be in " + "the groupby column keys" + ) remaining_columns = subset if dropna: From 76380862124fc49b4d63cafa40e621d1053c3a50 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 28 Nov 2021 01:02:52 +0000 Subject: [PATCH 45/77] Update generic.py --- pandas/core/groupby/generic.py | 80 +++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0bea1c7da5963..64283b1a7c448 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -69,10 +69,7 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import ( - base, - ops, -) +from pandas.core.groupby import base from pandas.core.groupby.groupby import ( GroupBy, _agg_template, @@ -1658,30 +1655,29 @@ def value_counts( with self._group_selection_context(): df = self.obj - # Check for mapping, array or function rather than column grouping - in_axis = self.grouper.groupings[0].in_axis - - # Try to find column names - if not in_axis: - keys: Any = [] - remaining_columns = self._selected_obj.columns - elif isinstance(self._selected_obj, Series): - keys = [grouping.name for grouping in self.grouper.groupings] - remaining_columns = [self._selected_obj.name] + grouping_info = [ + { + "level": i, + "name": grouping.name if grouping.in_axis else f"level_{i}", + "in_axis": grouping.in_axis, + } + for i, grouping in enumerate(self.grouper.groupings) + ] + in_axis_names = [ + grouping.name for grouping in self.grouper.groupings if grouping.in_axis + ] + if isinstance(self._selected_obj, Series): + name = self._selected_obj.name + remaining_columns = [] if name in in_axis_names else [name] else: - if isinstance(self.keys, ops.BaseGrouper): - keys = [grouping.name for grouping in self.keys.groupings] - elif isinstance(self.keys, str): - keys = [self.keys] - else: - keys = self.keys - remaining_columns = [ - key for key in self._selected_obj.columns if key not in keys + name + for name in self._selected_obj.columns + if name not in in_axis_names ] if subset is not None: - clashing = set(subset) & set(keys) + clashing = set(subset) - set(remaining_columns) if clashing: raise ValueError( f"Keys {clashing} in subset cannot be in " @@ -1703,10 +1699,10 @@ def value_counts( dropna=self.dropna, ) groupings = list(main_grouper.groupings) - for key in remaining_columns: + for name in remaining_columns: grouper, _, _ = get_grouper( df, - key=key, + key=name, axis=self.axis, level=self.level, sort=self.sort, @@ -1727,6 +1723,13 @@ def value_counts( ).size(), ) + # Temporarily fill in index column names + for info in grouping_info: + if not info["in_axis"]: + result.index.set_names( + info["name"], level=info["level"], inplace=True + ) + if normalize: # Normalize the results by dividing by the original group sizes indexed_group_size = df.groupby( @@ -1735,10 +1738,15 @@ def value_counts( observed=self.observed, dropna=self.dropna, ).size() - if not in_axis: - # The common index needs a common name - indexed_group_size.index.set_names("_group_", inplace=True) - result.index.set_names("_group_", level=0, inplace=True) + + # Set the non-in_axis index names + for info in grouping_info: + if not info["in_axis"]: + level = info["level"] if len(grouping_info) > 1 else None + indexed_group_size.index.set_names( + info["name"], level=level, inplace=True + ) + # Use indexed group size series if self.dropna or (isinstance(self.keys, list) and len(self.keys) == 1): result /= indexed_group_size @@ -1750,20 +1758,20 @@ def value_counts( / indexed_group_size.align(result, join="left")[0].values ) result = Series(data=values, index=result.index) - if not in_axis: - result.index.set_names(None, level=0, inplace=True) if sort: # Sort the values and then resort by the main grouping - if not in_axis: - level: Any = 0 - else: - level = keys + level = [info["level"] for info in grouping_info] result = result.sort_values(ascending=ascending).sort_index( level=level, sort_remaining=False ) - if not self.as_index: + if self.as_index: + # Remove Series index names (for compatability with size()) + for info in grouping_info: + if not info["in_axis"]: + result.index.set_names(None, level=info["level"], inplace=True) + else: # Convert to frame result = result.reset_index(name="proportion" if normalize else "count") return result From aa3cb98782ecb0e096222d50b6c9fc460d82f165 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 28 Nov 2021 09:12:56 +0000 Subject: [PATCH 46/77] Update generic.py --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 64283b1a7c448..cf7c60c7faf60 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1767,7 +1767,7 @@ def value_counts( ) if self.as_index: - # Remove Series index names (for compatability with size()) + # Remove Series index names (for compatibility with size()) for info in grouping_info: if not info["in_axis"]: result.index.set_names(None, level=info["level"], inplace=True) From 5e5d7e7fb46c1b0e097dc1f172e75ffe5aa92779 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 28 Nov 2021 10:51:48 +0000 Subject: [PATCH 47/77] Update generic.py --- pandas/core/groupby/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index cf7c60c7faf60..b6263a4927e4d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1683,7 +1683,7 @@ def value_counts( f"Keys {clashing} in subset cannot be in " "the groupby column keys" ) - remaining_columns = subset + remaining_columns = list(subset) if dropna: df = df.dropna(subset=remaining_columns, axis="index", how="any") @@ -1761,9 +1761,9 @@ def value_counts( if sort: # Sort the values and then resort by the main grouping - level = [info["level"] for info in grouping_info] + index_level = [info["level"] for info in grouping_info] result = result.sort_values(ascending=ascending).sort_index( - level=level, sort_remaining=False + level=index_level, sort_remaining=False ) if self.as_index: From 09cee2f767b9827af754fbbe87ea2c0466f7f3f7 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 29 Nov 2021 00:36:42 +0000 Subject: [PATCH 48/77] Add mixed grouping test --- .../tests/groupby/test_frame_value_counts.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 0b449eba3ca3e..c7da03dbae1b8 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -374,3 +374,27 @@ def test_categorical( name="proportion" if normalize else "count" ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "normalize, expected_label, expected_values", + [ + (False, "count", [1, 1, 1]), + (True, "proportion", [0.5, 0.5, 1.0]), + ], +) +def test_mixed_groupings(normalize, expected_label, expected_values): + # Test multiple groupings + df = pd.DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + gp = df.groupby([[4, 5, 4], "A", lambda i: 7 if i == 1 else 8], as_index=False) + result = gp.value_counts(sort=True, normalize=normalize) + expected = pd.DataFrame( + { + "level_0": [4, 4, 5], + "A": [1, 1, 2], + "level_2": [8, 8, 7], + "B": [1, 3, 2], + expected_label: expected_values, + } + ) + tm.assert_frame_equal(result, expected) From 8f81bd2c46366711065743340d244421d3e5a7dd Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 29 Nov 2021 12:33:05 +0000 Subject: [PATCH 49/77] Trigger CI From 95ccdb44ab0ebe111e28452254906b9ae87996e3 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 29 Nov 2021 12:40:29 +0000 Subject: [PATCH 50/77] Trigger CI From 085e8c9b20480cbf2913700190fab2a54b3faebe Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 29 Nov 2021 19:28:33 -0500 Subject: [PATCH 51/77] Some refinements --- pandas/core/groupby/generic.py | 108 +++++++++------------------------ 1 file changed, 29 insertions(+), 79 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b6263a4927e4d..ee5cd92b5f948 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1655,123 +1655,73 @@ def value_counts( with self._group_selection_context(): df = self.obj - grouping_info = [ - { - "level": i, - "name": grouping.name if grouping.in_axis else f"level_{i}", - "in_axis": grouping.in_axis, - } - for i, grouping in enumerate(self.grouper.groupings) - ] - in_axis_names = [ + in_axis_names = { grouping.name for grouping in self.grouper.groupings if grouping.in_axis - ] + } if isinstance(self._selected_obj, Series): name = self._selected_obj.name - remaining_columns = [] if name in in_axis_names else [name] + keys = [] if name in in_axis_names else [self._selected_obj] else: - remaining_columns = [ - name - for name in self._selected_obj.columns + keys = [ + # TODO: Is there a better way to get the Series for the grouper? + self._selected_obj.iloc[:, idx] + for idx, name in enumerate(self._selected_obj.columns) if name not in in_axis_names ] if subset is not None: - clashing = set(subset) - set(remaining_columns) + clashing = set(subset) & set(in_axis_names) if clashing: raise ValueError( f"Keys {clashing} in subset cannot be in " "the groupby column keys" ) - remaining_columns = list(subset) - - if dropna: - df = df.dropna(subset=remaining_columns, axis="index", how="any") - # Add the remaining_column keys to the main grouper - main_grouper, _, _ = get_grouper( - df, - key=self.keys, - axis=self.axis, - level=self.level, - sort=self.sort, - mutated=self.mutated, - dropna=self.dropna, - ) - groupings = list(main_grouper.groupings) - for name in remaining_columns: + groupings = list(self.grouper.groupings) + for key in keys: grouper, _, _ = get_grouper( df, - key=name, + key=key, axis=self.axis, level=self.level, sort=self.sort, mutated=self.mutated, - # nulls have already been dropped from remaining_columns - dropna=False, + dropna=dropna, ) groupings += list(grouper.groupings) # Take the size of the overall columns - result = cast( - Series, - df.groupby( - groupings, - sort=self.sort, - observed=self.observed, - dropna=self.dropna, - ).size(), + gb = df.groupby( + groupings, + sort=self.sort, + observed=self.observed, + dropna=self.dropna, ) - - # Temporarily fill in index column names - for info in grouping_info: - if not info["in_axis"]: - result.index.set_names( - info["name"], level=info["level"], inplace=True - ) + result = cast(Series, gb.size()) if normalize: - # Normalize the results by dividing by the original group sizes - indexed_group_size = df.groupby( - main_grouper, + # Normalize the results by dividing by the original group sizes. + # We are guaranteed to have the first N levels be the + # user-requested grouping. + # TODO: Is there a better way to subset levels from a MultiIndex? + levels = list(range(len(self.grouper.groupings), result.index.nlevels)) + indexed_group_size = result.groupby( + result.index.droplevel(levels), sort=self.sort, observed=self.observed, dropna=self.dropna, - ).size() - - # Set the non-in_axis index names - for info in grouping_info: - if not info["in_axis"]: - level = info["level"] if len(grouping_info) > 1 else None - indexed_group_size.index.set_names( - info["name"], level=level, inplace=True - ) + ).transform("sum") - # Use indexed group size series - if self.dropna or (isinstance(self.keys, list) and len(self.keys) == 1): - result /= indexed_group_size - else: - # Unfortunately, nans in multi-column multiindex sometimes makes - # size() produce a Series that puts nans in the result - values = ( - result.values - / indexed_group_size.align(result, join="left")[0].values - ) - result = Series(data=values, index=result.index) + result /= indexed_group_size if sort: # Sort the values and then resort by the main grouping - index_level = [info["level"] for info in grouping_info] + index_level = range(len(self.grouper.groupings)) result = result.sort_values(ascending=ascending).sort_index( level=index_level, sort_remaining=False ) - if self.as_index: - # Remove Series index names (for compatibility with size()) - for info in grouping_info: - if not info["in_axis"]: - result.index.set_names(None, level=info["level"], inplace=True) - else: + if not self.as_index: # Convert to frame result = result.reset_index(name="proportion" if normalize else "count") return result From bb5f82a220a4f3ad3454fa8c344037ab2e06c729 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 30 Nov 2021 09:42:42 +0000 Subject: [PATCH 52/77] Trigger CI From 14d81726cbf2fa397222da9e6d6e2872450beb6a Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 2 Dec 2021 12:54:31 +0000 Subject: [PATCH 53/77] Add test_column_name_clashes --- .../tests/groupby/test_frame_value_counts.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index c7da03dbae1b8..cadeabeb4255d 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -398,3 +398,26 @@ def test_mixed_groupings(normalize, expected_label, expected_values): } ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "test, expected_values", + [ + ("repeat", []), + ("level", []), + ], +) +def test_column_name_clashes(test, expected_values): + df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]}) + if test == "repeat": + df.columns = list("abbde") + else: + df.columns = list("abcd") + ["level_1"] + print("\nDF") + print(df) + result = df.groupby(["a", [0, 1], "d"], as_index=False).value_counts() + print("\nRESULT") + print(result) + expected = pd.DataFrame({"a": [1, 2], "level_1": [0, 1], "d": [7, 8], "b": [3, 4], "c": [5, 6], "e": [9, 10], "count": [1, 1]}) + print(expected) + tm.assert_frame_equal(result, expected) From e26cba113fcb689bdcc7750262eda8a77c742858 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 2 Dec 2021 13:40:48 +0000 Subject: [PATCH 54/77] Update test_frame_value_counts.py --- .../tests/groupby/test_frame_value_counts.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index cadeabeb4255d..14ec6d1f78016 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -401,23 +401,24 @@ def test_mixed_groupings(normalize, expected_label, expected_values): @pytest.mark.parametrize( - "test, expected_values", + "test, expected_names", [ - ("repeat", []), - ("level", []), + ("repeat", ["a", None, "d", "b", "b", "e"]), + ("level", ["a", None, "d", "b", "c", "level_1"]), ], ) -def test_column_name_clashes(test, expected_values): +def test_column_name_clashes(test, expected_names): df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]}) if test == "repeat": df.columns = list("abbde") else: df.columns = list("abcd") + ["level_1"] - print("\nDF") - print(df) - result = df.groupby(["a", [0, 1], "d"], as_index=False).value_counts() - print("\nRESULT") - print(result) - expected = pd.DataFrame({"a": [1, 2], "level_1": [0, 1], "d": [7, 8], "b": [3, 4], "c": [5, 6], "e": [9, 10], "count": [1, 1]}) - print(expected) - tm.assert_frame_equal(result, expected) + result = df.groupby(["a", [0, 1], "d"]).value_counts() + expected = pd.Series( + data=(1, 1), + index=pd.MultiIndex.from_tuples( + [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)], + names=expected_names, + ), + ) + tm.assert_series_equal(result, expected) From 928a9d7911449feabf577c71f43c7223085f39dd Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 2 Dec 2021 14:02:24 +0000 Subject: [PATCH 55/77] Update test_frame_value_counts.py --- pandas/tests/groupby/test_frame_value_counts.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 14ec6d1f78016..fac0f65e5424a 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -408,7 +408,9 @@ def test_mixed_groupings(normalize, expected_label, expected_values): ], ) def test_column_name_clashes(test, expected_names): - df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]}) + df = pd.DataFrame( + {"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]} + ) if test == "repeat": df.columns = list("abbde") else: From ad0f5b41765088e752dd2e05311b12c521d3fb82 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 2 Dec 2021 16:26:25 +0000 Subject: [PATCH 56/77] Trigger CI From e827cd30a529699f58b60fbcf6717a028b918452 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 2 Dec 2021 23:44:39 +0000 Subject: [PATCH 57/77] Trigger CI From 2c2b9670117f5cf656267ff6c2de8d782fb093ab Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 3 Dec 2021 08:19:38 +0000 Subject: [PATCH 58/77] reset_index to cope with duplicate labels --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 12c6eaa86552f..9319f897ee68d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5847,7 +5847,7 @@ class max type level_values, lab, allow_fill=True, fill_value=lev._na_value ) - new_obj.insert(0, name, level_values) + new_obj.insert(0, name, level_values, allow_duplicates=self.flags.allows_duplicate_labels) new_obj.index = new_index if not inplace: From 51a3a3e02868bc74aa5ccfa15eb67a741d9af7f9 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 3 Dec 2021 08:21:56 +0000 Subject: [PATCH 59/77] Update frame.py --- pandas/core/frame.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9319f897ee68d..0abc9f14f2247 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5847,7 +5847,12 @@ class max type level_values, lab, allow_fill=True, fill_value=lev._na_value ) - new_obj.insert(0, name, level_values, allow_duplicates=self.flags.allows_duplicate_labels) + new_obj.insert( + 0, + name, + level_values, + allow_duplicates=self.flags.allows_duplicate_labels, + ) new_obj.index = new_index if not inplace: From ec2a2d44bd10ba961379f2bd959620069c3576c6 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 3 Dec 2021 09:20:39 +0000 Subject: [PATCH 60/77] Update test_put.py --- pandas/tests/io/pytables/test_put.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index 41addc5023436..5b73aae9a3541 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -351,7 +351,7 @@ def make_index(names=None): columns=["a", "b"], index=make_index(["date", "a", "t"]), ) - msg = "duplicate names/columns in the multi-index when storing as a table" + msg = "cannot reindex on an axis with duplicate labels" with pytest.raises(ValueError, match=msg): store.append("df", df) From 8e4f3ed2efd27a0aca03034a19bd11735b789911 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 3 Dec 2021 10:47:07 +0000 Subject: [PATCH 61/77] Comment out tests that now pass --- pandas/tests/frame/methods/test_reset_index.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 43af48cf4a654..dd3e201b379f3 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -329,9 +329,9 @@ def test_reset_index_multiindex_nan(self): def test_reset_index_with_datetimeindex_cols(self, name): # GH#5818 warn = None - if isinstance(name, Timestamp) and name.tz is not None: - # _deprecate_mismatched_indexing - warn = FutureWarning +# JZ if isinstance(name, Timestamp) and name.tz is not None: +# # _deprecate_mismatched_indexing +# warn = FutureWarning df = DataFrame( [[1, 2], [3, 4]], @@ -375,9 +375,9 @@ def test_reset_index_multiindex_columns(self): tm.assert_frame_equal(result, df) # GH#16120: already existing column - msg = r"cannot insert \('A', ''\), already exists" - with pytest.raises(ValueError, match=msg): - df.rename_axis("A").reset_index() +# JZ msg = r"cannot insert \('A', ''\), already exists" +# with pytest.raises(ValueError, match=msg): +# df.rename_axis("A").reset_index() # GH#16164: multiindex (tuple) full key result = df.set_index([("A", "")]).reset_index() From b2c61ded51acf1b5141257c0cbba210d635b3719 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 3 Dec 2021 10:57:12 +0000 Subject: [PATCH 62/77] Update test_reset_index.py --- pandas/tests/frame/methods/test_reset_index.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index dd3e201b379f3..6a917ace08e86 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -329,9 +329,9 @@ def test_reset_index_multiindex_nan(self): def test_reset_index_with_datetimeindex_cols(self, name): # GH#5818 warn = None -# JZ if isinstance(name, Timestamp) and name.tz is not None: -# # _deprecate_mismatched_indexing -# warn = FutureWarning + # JZ if isinstance(name, Timestamp) and name.tz is not None: + # # _deprecate_mismatched_indexing + # warn = FutureWarning df = DataFrame( [[1, 2], [3, 4]], @@ -375,9 +375,9 @@ def test_reset_index_multiindex_columns(self): tm.assert_frame_equal(result, df) # GH#16120: already existing column -# JZ msg = r"cannot insert \('A', ''\), already exists" -# with pytest.raises(ValueError, match=msg): -# df.rename_axis("A").reset_index() + # JZ msg = r"cannot insert \('A', ''\), already exists" + # with pytest.raises(ValueError, match=msg): + # df.rename_axis("A").reset_index() # GH#16164: multiindex (tuple) full key result = df.set_index([("A", "")]).reset_index() From 392986d7b7b989465035c23ca8eca3c6f188d7ac Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 3 Dec 2021 11:51:10 +0000 Subject: [PATCH 63/77] Trigger CI From 34e6529bf524d48a468465495efe72442f0ebe5c Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Dec 2021 17:34:24 +0000 Subject: [PATCH 64/77] Update generic.py --- pandas/core/groupby/generic.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ee5cd92b5f948..66ec6a6b2ee85 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1663,7 +1663,7 @@ def value_counts( keys = [] if name in in_axis_names else [self._selected_obj] else: keys = [ - # TODO: Is there a better way to get the Series for the grouper? + # Can't use .values because the column label needs to be preserved self._selected_obj.iloc[:, idx] for idx, name in enumerate(self._selected_obj.columns) if name not in in_axis_names @@ -1683,9 +1683,7 @@ def value_counts( df, key=key, axis=self.axis, - level=self.level, sort=self.sort, - mutated=self.mutated, dropna=dropna, ) groupings += list(grouper.groupings) From 91e1ff3890f253743f0bd945300c96945ac3b609 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Dec 2021 17:46:14 +0000 Subject: [PATCH 65/77] Update test_reset_index.py --- pandas/tests/frame/methods/test_reset_index.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 6a917ace08e86..1479c880155a8 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -328,21 +328,13 @@ def test_reset_index_multiindex_nan(self): ) def test_reset_index_with_datetimeindex_cols(self, name): # GH#5818 - warn = None - # JZ if isinstance(name, Timestamp) and name.tz is not None: - # # _deprecate_mismatched_indexing - # warn = FutureWarning - df = DataFrame( [[1, 2], [3, 4]], columns=date_range("1/1/2013", "1/2/2013"), index=["A", "B"], ) df.index.name = name - - with tm.assert_produces_warning(warn): - result = df.reset_index() - + result = df.reset_index() item = name if name is not None else "index" columns = Index([item, datetime(2013, 1, 1), datetime(2013, 1, 2)]) if isinstance(item, str) and item == "2012-12-31": @@ -374,11 +366,6 @@ def test_reset_index_multiindex_columns(self): result = df[["B"]].rename_axis("A").reset_index() tm.assert_frame_equal(result, df) - # GH#16120: already existing column - # JZ msg = r"cannot insert \('A', ''\), already exists" - # with pytest.raises(ValueError, match=msg): - # df.rename_axis("A").reset_index() - # GH#16164: multiindex (tuple) full key result = df.set_index([("A", "")]).reset_index() tm.assert_frame_equal(result, df) From e062823db69c66c65448f76012d5e634928ec9f7 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 5 Dec 2021 15:17:08 +0000 Subject: [PATCH 66/77] Improve test imports --- pandas/core/groupby/generic.py | 1 - .../tests/groupby/test_frame_value_counts.py | 64 ++++++++++--------- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 66ec6a6b2ee85..4c6052d2c27ab 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1701,7 +1701,6 @@ def value_counts( # Normalize the results by dividing by the original group sizes. # We are guaranteed to have the first N levels be the # user-requested grouping. - # TODO: Is there a better way to subset levels from a MultiIndex? levels = list(range(len(self.grouper.groupings), result.index.nlevels)) indexed_group_size = result.groupby( result.index.droplevel(levels), diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index fac0f65e5424a..1b61e2c741d6b 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -1,13 +1,19 @@ import numpy as np import pytest -import pandas as pd +from pandas import ( + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + Series, +) import pandas._testing as tm @pytest.fixture def education_df(): - return pd.DataFrame( + return DataFrame( { "gender": ["male", "male", "female", "male", "female", "male"], "education": ["low", "medium", "high", "low", "high", "low"], @@ -33,9 +39,9 @@ def test_basic(education_df): result = education_df.groupby("country")[["gender", "education"]].value_counts( normalize=True ) - expected = pd.Series( + expected = Series( data=[0.5, 0.25, 0.25, 0.5, 0.5], - index=pd.MultiIndex.from_tuples( + index=MultiIndex.from_tuples( [ ("FR", "male", "low"), ("FR", "female", "high"), @@ -119,7 +125,7 @@ def test_against_frame_and_seriesgroupby( index_frame["education"] = index_frame["both"].str.split("-").str.get(1) del index_frame["both"] index_frame = index_frame.rename({0: None}, axis=1) - expected.index = pd.MultiIndex.from_frame(index_frame) + expected.index = MultiIndex.from_frame(index_frame) tm.assert_series_equal(result, expected) else: expected.insert(1, "gender", expected["both"].str.split("-").str.get(0)) @@ -151,7 +157,7 @@ def test_compound( result = gp["education"].value_counts( normalize=normalize, sort=sort, ascending=ascending ) - expected = pd.DataFrame() + expected = DataFrame() for column in ["country", "gender", "education"]: expected[column] = [education_df[column][row] for row in expected_rows] if normalize: @@ -164,7 +170,7 @@ def test_compound( @pytest.fixture def animals_df(): - return pd.DataFrame( + return DataFrame( {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, index=["falcon", "dog", "cat", "ant"], ) @@ -187,9 +193,9 @@ def test_data_frame_value_counts( result_frame = animals_df.value_counts( sort=sort, ascending=ascending, normalize=normalize ) - expected = pd.Series( + expected = Series( data=expected_data, - index=pd.MultiIndex.from_arrays( + index=MultiIndex.from_arrays( expected_index, names=["key", "num_legs", "num_wings"] ), ) @@ -205,7 +211,7 @@ def test_data_frame_value_counts( @pytest.fixture def nulls_df(): n = np.nan - return pd.DataFrame( + return DataFrame( { "A": [1, 1, n, 4, n, 6, 6, 6, 6], "B": [1, 1, 3, n, n, 6, 6, 6, 6], @@ -234,17 +240,17 @@ def test_dropna_combinations( ): gp = nulls_df.groupby(["A", "B"], dropna=group_dropna) result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna) - columns = pd.DataFrame() + columns = DataFrame() for column in nulls_df.columns: columns[column] = [nulls_df[column][row] for row in expected_rows] - index = pd.MultiIndex.from_frame(columns) - expected = pd.Series(data=expected_values, index=index) + index = MultiIndex.from_frame(columns) + expected = Series(data=expected_values, index=index) tm.assert_series_equal(result, expected) @pytest.fixture def names_with_nulls_df(nulls_fixture): - return pd.DataFrame( + return DataFrame( { "key": [1, 1, 1, 1], "first_name": ["John", "Anne", "John", "Beth"], @@ -259,7 +265,7 @@ def names_with_nulls_df(nulls_fixture): ( True, [1, 1], - pd.MultiIndex.from_arrays( + MultiIndex.from_arrays( [(1, 1), ("Beth", "John"), ("Louise", "Smith")], names=["key", "first_name", "middle_name"], ), @@ -267,11 +273,11 @@ def names_with_nulls_df(nulls_fixture): ( False, [1, 1, 1, 1], - pd.MultiIndex( + MultiIndex( levels=[ - pd.Index([1]), - pd.Index(["Anne", "Beth", "John"]), - pd.Index(["Louise", "Smith", np.nan]), + Index([1]), + Index(["Anne", "Beth", "John"]), + Index(["Louise", "Smith", np.nan]), ], codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]], names=["key", "first_name", "middle_name"], @@ -287,7 +293,7 @@ def test_data_frame_value_counts_dropna( # 3-way compare with :meth:`~DataFrame.value_counts` # Tests with nulls from frame/methods/test_value_counts.py result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize) - expected = pd.Series( + expected = Series( data=expected_data, index=expected_index, ) @@ -355,16 +361,16 @@ def test_categorical( ) result = gp.value_counts(normalize=normalize) - expected_series = pd.Series( + expected_series = Series( data=expected_data[expected_data > 0.0] if observed else expected_data, - index=pd.MultiIndex.from_tuples( + index=MultiIndex.from_tuples( expected_index, names=["country", "gender", "education"], ), ) for i in range(3): expected_series.index = expected_series.index.set_levels( - pd.CategoricalIndex(expected_series.index.levels[i]), level=i + CategoricalIndex(expected_series.index.levels[i]), level=i ) if as_index: @@ -385,10 +391,10 @@ def test_categorical( ) def test_mixed_groupings(normalize, expected_label, expected_values): # Test multiple groupings - df = pd.DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) gp = df.groupby([[4, 5, 4], "A", lambda i: 7 if i == 1 else 8], as_index=False) result = gp.value_counts(sort=True, normalize=normalize) - expected = pd.DataFrame( + expected = DataFrame( { "level_0": [4, 4, 5], "A": [1, 1, 2], @@ -408,17 +414,15 @@ def test_mixed_groupings(normalize, expected_label, expected_values): ], ) def test_column_name_clashes(test, expected_names): - df = pd.DataFrame( - {"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]} - ) + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]}) if test == "repeat": df.columns = list("abbde") else: df.columns = list("abcd") + ["level_1"] result = df.groupby(["a", [0, 1], "d"]).value_counts() - expected = pd.Series( + expected = Series( data=(1, 1), - index=pd.MultiIndex.from_tuples( + index=MultiIndex.from_tuples( [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)], names=expected_names, ), From 493e3aaf05b2e3bb2e025f5ddd4a48579fa500aa Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 5 Dec 2021 19:17:36 +0000 Subject: [PATCH 67/77] Update test_frame_value_counts.py --- pandas/tests/groupby/test_frame_value_counts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 1b61e2c741d6b..486f3259d569d 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -111,8 +111,7 @@ def test_against_frame_and_seriesgroupby( else: expected["level_0"] = np.where(expected["level_0"], "US", "FR") tm.assert_frame_equal(result, expected) - elif groupby == "column" or as_index: - # (otherwise SeriesGroupby crashes) + else: # compare against SeriesGroupBy value_counts education_df["both"] = education_df["gender"] + "-" + education_df["education"] expected = gp["both"].value_counts( From 548c45bd085280cb34dff71c85421dd81e271b01 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 5 Dec 2021 19:19:08 +0000 Subject: [PATCH 68/77] Update test_frame_value_counts.py --- pandas/tests/groupby/test_frame_value_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 486f3259d569d..5f31daf64e105 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -81,7 +81,7 @@ def test_against_frame_and_seriesgroupby( # - Whether or not to use the groupby as an index # - 3-way compare against: # - apply with :meth:`~DataFrame.value_counts` - # - `~SeriesGroupBy.value_counts` (apart from certain cases where it crashes) + # - `~SeriesGroupBy.value_counts` by = { "column": "country", "array": education_df["country"].values, From 6141f85312cbe91506314e55b23d69ae2f1d9e03 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 6 Dec 2021 12:16:04 +0000 Subject: [PATCH 69/77] Revert changes to reset_index() --- pandas/core/frame.py | 6 +----- pandas/tests/frame/methods/test_reset_index.py | 10 +++++++++- pandas/tests/io/pytables/test_put.py | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 52e10326cb4d0..0b16539948197 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5839,11 +5839,7 @@ class max type ) new_obj.insert( - 0, - name, - level_values, - allow_duplicates=self.flags.allows_duplicate_labels, - ) + new_obj.insert(0, name, level_values) new_obj.index = new_index if not inplace: diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 1479c880155a8..f5335c2cf7a25 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -334,7 +334,10 @@ def test_reset_index_with_datetimeindex_cols(self, name): index=["A", "B"], ) df.index.name = name - result = df.reset_index() + + with tm.assert_produces_warning(warn): + result = df.reset_index() + item = name if name is not None else "index" columns = Index([item, datetime(2013, 1, 1), datetime(2013, 1, 2)]) if isinstance(item, str) and item == "2012-12-31": @@ -366,6 +369,11 @@ def test_reset_index_multiindex_columns(self): result = df[["B"]].rename_axis("A").reset_index() tm.assert_frame_equal(result, df) + # GH#16120: already existing column + msg = r"cannot insert \('A', ''\), already exists" + with pytest.raises(ValueError, match=msg): + df.rename_axis("A").reset_index() + # GH#16164: multiindex (tuple) full key result = df.set_index([("A", "")]).reset_index() tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index 5b73aae9a3541..41addc5023436 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -351,7 +351,7 @@ def make_index(names=None): columns=["a", "b"], index=make_index(["date", "a", "t"]), ) - msg = "cannot reindex on an axis with duplicate labels" + msg = "duplicate names/columns in the multi-index when storing as a table" with pytest.raises(ValueError, match=msg): store.append("df", df) From 050f070af38bd5ac3c8551dba695a5e6b29c3821 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 6 Dec 2021 12:16:21 +0000 Subject: [PATCH 70/77] Update frame.py --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0b16539948197..24bf86147d3e1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5838,7 +5838,6 @@ class max type level_values, lab, allow_fill=True, fill_value=lev._na_value ) - new_obj.insert( new_obj.insert(0, name, level_values) new_obj.index = new_index From d669af3de0017101ae1f1f8ee1fa04abffebf5a3 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 6 Dec 2021 12:44:04 +0000 Subject: [PATCH 71/77] Add reset_index failure to test --- .../tests/frame/methods/test_reset_index.py | 7 ++++- .../tests/groupby/test_frame_value_counts.py | 26 ++++++++++++------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index f5335c2cf7a25..43af48cf4a654 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -328,13 +328,18 @@ def test_reset_index_multiindex_nan(self): ) def test_reset_index_with_datetimeindex_cols(self, name): # GH#5818 + warn = None + if isinstance(name, Timestamp) and name.tz is not None: + # _deprecate_mismatched_indexing + warn = FutureWarning + df = DataFrame( [[1, 2], [3, 4]], columns=date_range("1/1/2013", "1/2/2013"), index=["A", "B"], ) df.index.name = name - + with tm.assert_produces_warning(warn): result = df.reset_index() diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 5f31daf64e105..f4fb42dd025de 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -412,18 +412,24 @@ def test_mixed_groupings(normalize, expected_label, expected_values): ("level", ["a", None, "d", "b", "c", "level_1"]), ], ) -def test_column_name_clashes(test, expected_names): +@pytest.mark.parametrize("as_index", [False, True]) +def test_column_name_clashes(test, expected_names, as_index): df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]}) if test == "repeat": df.columns = list("abbde") else: df.columns = list("abcd") + ["level_1"] - result = df.groupby(["a", [0, 1], "d"]).value_counts() - expected = Series( - data=(1, 1), - index=MultiIndex.from_tuples( - [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)], - names=expected_names, - ), - ) - tm.assert_series_equal(result, expected) + + if as_index: + result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts() + expected = Series( + data=(1, 1), + index=MultiIndex.from_tuples( + [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)], + names=expected_names, + ), + ) + tm.assert_series_equal(result, expected) + else: + with pytest.raises(ValueError, match="cannot insert"): + df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts() From db3125784740ae1d980797d2438b31dcfe67aed0 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 12 Dec 2021 20:05:55 +0000 Subject: [PATCH 72/77] Add grouping test --- pandas/core/groupby/grouper.py | 2 +- pandas/tests/groupby/test_frame_value_counts.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index da2c858e50643..1e6515084d3b7 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -800,7 +800,7 @@ def get_grouper( # what are we after, exactly? any_callable = any(callable(g) or isinstance(g, dict) for g in keys) - any_groupers = any(isinstance(g, Grouper) or isinstance(g, Grouping) for g in keys) + any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys) any_arraylike = any( isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys ) diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index f4fb42dd025de..79ef46db8e95e 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -433,3 +433,12 @@ def test_column_name_clashes(test, expected_names, as_index): else: with pytest.raises(ValueError, match="cannot insert"): df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts() + + +def test_ambiguous_grouping(): + # Test that groupby is not confused by groupings length equal to row count + df = DataFrame({"a": [1, 1]}) + gb = df.groupby([1, 1]) + result = gb.value_counts() + expected = Series([2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"])) + tm.assert_series_equal(result, expected) From 124b1e96a4cb0a4dc7e436592732921a9ef67707 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 12 Dec 2021 21:32:40 +0000 Subject: [PATCH 73/77] Trigger CI From a776a3d575a9a0ef8b8ba843b7dffa468f021c74 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 14 Dec 2021 23:33:38 +0000 Subject: [PATCH 74/77] Trigger CI From fe582456f6f536ac8498fd90802fadd3b9e63b57 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 17 Dec 2021 16:12:14 +0000 Subject: [PATCH 75/77] Update generic.py --- pandas/core/groupby/generic.py | 41 +++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4c6052d2c27ab..787ec8a466a8d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1613,7 +1613,8 @@ def value_counts( - If the groupby as_index is True then the returned Series will have a MultiIndex with one level per input column. - If the groupby as_index is False then the returned DataFrame will have an - additional column with the value_counts. + additional column with the value_counts. The column is labelled 'count' or + 'proportion', depending on the ``normalize`` parameter. By default, rows that contain any NA values are omitted from the result. @@ -1638,7 +1639,25 @@ def value_counts( 4 female high FR 5 male low FR - >>> df.groupby("gender").value_counts(normalize=True) + >>> df.groupby('gender').value_counts() + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + dtype: float64 + + >>> df.groupby('gender').value_counts(ascending=True) + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + dtype: int64 + + >>> df.groupby('gender').value_counts(normalize=True) gender education country female high FR 0.50 US 0.50 @@ -1646,6 +1665,22 @@ def value_counts( US 0.25 medium FR 0.25 dtype: float64 + + >>> df.groupby('gender', as_index=False).value_counts() + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + gender education country proportion + 0 female high FR 0.50 + 1 female high US 0.50 + 2 male low FR 0.50 + 3 male low US 0.25 + 4 male medium FR 0.25 """ if self.axis == 1: raise NotImplementedError( @@ -1721,7 +1756,7 @@ def value_counts( if not self.as_index: # Convert to frame result = result.reset_index(name="proportion" if normalize else "count") - return result + return result.__finalize__(self.obj, method="value_counts") def _wrap_transform_general_frame( From 857e5be42630645a5985d8d46b509ba0a5bc8249 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 18 Dec 2021 00:49:12 +0000 Subject: [PATCH 76/77] Update generic.py --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 787ec8a466a8d..9b341845c7170 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1646,7 +1646,7 @@ def value_counts( male low FR 2 US 1 medium FR 1 - dtype: float64 + dtype: int64 >>> df.groupby('gender').value_counts(ascending=True) gender education country From c8f173149189bce00ba7bf887508b520928bb5c4 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 18 Dec 2021 09:33:37 +0000 Subject: [PATCH 77/77] Trigger CI