From 696130bfcf5318f7e291969ca0a114dba644b77a Mon Sep 17 00:00:00 2001
From: John Zangwill
Date: Fri, 31 Dec 2021 21:46:45 +0000
Subject: [PATCH 01/12] Update test_frame_value_counts.py

---
 pandas/tests/groupby/test_frame_value_counts.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py
index 79ef46db8e95e..72dbbc5778a20 100644
--- a/pandas/tests/groupby/test_frame_value_counts.py
+++ b/pandas/tests/groupby/test_frame_value_counts.py
@@ -406,19 +406,15 @@ def test_mixed_groupings(normalize, expected_label, expected_values):
 
 
 @pytest.mark.parametrize(
-    "test, expected_names",
+    "test, columns, expected_names",
     [
-        ("repeat", ["a", None, "d", "b", "b", "e"]),
-        ("level", ["a", None, "d", "b", "c", "level_1"]),
+        ("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]),
+        ("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]),
     ],
 )
 @pytest.mark.parametrize("as_index", [False, True])
-def test_column_name_clashes(test, expected_names, as_index):
-    df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]})
-    if test == "repeat":
-        df.columns = list("abbde")
-    else:
-        df.columns = list("abcd") + ["level_1"]
+def test_column_name_clashes(test, columns, expected_names, as_index):
+    df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
 
     if as_index:
         result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()

From 6b039893ff9513d0adca4ce4e23d811a3a0cf830 Mon Sep 17 00:00:00 2001
From: John Zangwill
Date: Sat, 1 Jan 2022 18:45:02 +0000
Subject: [PATCH 02/12] Implement value_counts with duplicates and add test

---
 pandas/core/groupby/generic.py               | 32 ++++++++++++++---
 .../tests/groupby/test_frame_value_counts.py | 34 +++++++++++++++----
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 9b341845c7170..eb554fc26ed11 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -26,7 +26,10 @@
 
 import numpy as np
 
-from pandas._libs import reduction as libreduction
+from pandas._libs import (
+    lib,
+    reduction as libreduction,
+)
 from pandas._typing import (
     ArrayLike,
     Manager,
@@ -1730,7 +1733,7 @@ def value_counts(
             observed=self.observed,
             dropna=self.dropna,
         )
-        result = cast(Series, gb.size())
+        result = gb.size()
 
         if normalize:
             # Normalize the results by dividing by the original group sizes.
@@ -1749,13 +1752,32 @@ def value_counts(
         if sort:
             # Sort the values and then resort by the main grouping
             index_level = range(len(self.grouper.groupings))
-            result = result.sort_values(ascending=ascending).sort_index(
-                level=index_level, sort_remaining=False
+            result = (
+                cast(Series, result)
+                .sort_values(ascending=ascending)
+                .sort_index(level=index_level, sort_remaining=False)
             )
 
         if not self.as_index:
             # Convert to frame
-            result = result.reset_index(name="proportion" if normalize else "count")
+            name = "proportion" if normalize else "count"
+            columns = result.index.names
+            if name in columns:
+                raise ValueError(
+                    f"Column label '{name}' is duplicate of result column"
+                )
+            columns = com.fill_missing_names(columns)
+            values = result.values
+            result_frame = DataFrame()
+            for i, column in enumerate(columns):
+                level_values = result.index.get_level_values(i)._values
+                if level_values.dtype == np.object_:
+                    level_values = lib.maybe_convert_objects(
+                        cast(np.ndarray, level_values)
+                    )
+                result_frame.insert(i, column, level_values, allow_duplicates=True)
+            result = result_frame.assign(**{name: values})
+
 
         return result.__finalize__(self.obj, method="value_counts")

diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py
index 72dbbc5778a20..02f9226f67409 100644
--- a/pandas/tests/groupby/test_frame_value_counts.py
+++ b/pandas/tests/groupby/test_frame_value_counts.py
@@ -413,22 +413,44 @@ def test_mixed_groupings(normalize, expected_label, expected_values):
     ],
 )
 @pytest.mark.parametrize("as_index", [False, True])
-def test_column_name_clashes(test, columns, expected_names, as_index):
+def test_column_label_duplicates(test, columns, expected_names, as_index):
+    # Test for duplicate input column labels and generated duplicate labels
     df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
-
+    expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)]
+    result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
     if as_index:
-        result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
         expected = Series(
             data=(1, 1),
             index=MultiIndex.from_tuples(
-                [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)],
+                expected_data,
                 names=expected_names,
             ),
         )
         tm.assert_series_equal(result, expected)
     else:
-        with pytest.raises(ValueError, match="cannot insert"):
-            df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts()
+        expected_data = [list(row) + [1] for row in expected_data]
+        expected_columns = list(expected_names)
+        expected_columns[1] = "level_1"
+        expected_columns.append("count")
+        expected = DataFrame(expected_data, columns=expected_columns)
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "normalize, expected_label",
+    [
+        (False, "count"),
+        (True, "proportion"),
+    ],
+)
+def test_result_label_duplicates(normalize, expected_label):
+    # Test for result column label duplicating an input column label
+    gb = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label]).groupby(
+        "a", as_index=False
+    )
+    msg = f"Column label '{expected_label}' is duplicate of result column"
+    with pytest.raises(ValueError, match=msg):
+        gb.value_counts(normalize=normalize)
 
 
 def test_ambiguous_grouping():

From 90933743f289abab5673943fb3a398a00c87b76b Mon Sep 17 00:00:00 2001
From: John Zangwill
Date: Sun, 2 Jan 2022 20:09:29 +0000
Subject: [PATCH 03/12] redo using private methods

---
 pandas/core/frame.py           | 16 +++++++++++++++-
 pandas/core/groupby/generic.py | 34 +++++++++++-----------------------
 pandas/core/series.py          | 19 ++++++++++++++++++-
 3 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index a39c1b0bf43f2..c2a59deca130b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5776,6 +5776,20 @@ class max type
         lion    mammal   80.5     run
         monkey  mammal    NaN    jump
         """
+        return self._reset_index(level, drop, inplace, col_level, col_fill)
+
+    def _reset_index(
+        self,
+        level: Hashable | Sequence[Hashable] | None = None,
+        drop: bool = False,
+        inplace: bool = False,
+        col_level: Hashable = 0,
+        col_fill: Hashable = "",
+        allow_duplicates: bool = False,
+    ) -> DataFrame | None:
+        """
+        Private version of reset_index with additional allow_duplicates parameter
+        """
         inplace = validate_bool_kwarg(inplace, "inplace")
         self._check_inplace_and_allows_duplicate_labels(inplace)
         if inplace:
@@ -5833,7 +5847,7 @@ class max type
                         level_values, lab, allow_fill=True, fill_value=lev._na_value
                     )
 
-                new_obj.insert(0, name, level_values)
+                new_obj.insert(0, name, level_values, allow_duplicates=allow_duplicates)
 
         new_obj.index = new_index
         if not inplace:
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 9945795e6c60b..2d4116c05d598 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -26,10 +26,7 @@
 
 import numpy as np
 
-from pandas._libs import (
-    lib,
-    reduction as libreduction,
-)
+from pandas._libs import reduction as libreduction
 from pandas._typing import (
     ArrayLike,
     Manager,
@@ -1734,7 +1731,7 @@ def value_counts(
             observed=self.observed,
             dropna=self.dropna,
         )
-        result = gb.size()
+        result = cast(Series, gb.size())
 
         if normalize:
             # Normalize the results by dividing by the original group sizes.
@@ -1753,13 +1750,13 @@ def value_counts(
         if sort:
             # Sort the values and then resort by the main grouping
             index_level = range(len(self.grouper.groupings))
-            result = (
-                cast(Series, result)
-                .sort_values(ascending=ascending)
-                .sort_index(level=index_level, sort_remaining=False)
+            result = result.sort_values(ascending=ascending).sort_index(
+                level=index_level, sort_remaining=False
             )
 
-        if not self.as_index:
+        if self.as_index:
+            return result.__finalize__(self.obj, method="value_counts")
+        else:
             # Convert to frame
             name = "proportion" if normalize else "count"
             columns = result.index.names
@@ -1767,19 +1764,10 @@ def value_counts(
                 raise ValueError(
                     f"Column label '{name}' is duplicate of result column"
                 )
-            columns = com.fill_missing_names(columns)
-            values = result.values
-            result_frame = DataFrame()
-            for i, column in enumerate(columns):
-                level_values = result.index.get_level_values(i)._values
-                if level_values.dtype == np.object_:
-                    level_values = lib.maybe_convert_objects(
-                        cast(np.ndarray, level_values)
-                    )
-                result_frame.insert(i, column, level_values, allow_duplicates=True)
-            result = result_frame.assign(**{name: values})
-
-        return result.__finalize__(self.obj, method="value_counts")
+            result_frame = cast(
+                DataFrame, result._reset_index(name=name, allow_duplicates=True)
+            )
+            return result_frame.__finalize__(self.obj, method="value_counts")
 
 
 def _wrap_transform_general_frame(
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 81b901b13a42b..f619741424c5b 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1462,6 +1462,19 @@ def reset_index(self, level=None, drop=False, name=lib.no_default, inplace=False
         2  baz    one    2
         3  baz    two    3
         """
+        return self._reset_index(level, drop, name, inplace)
+
+    def _reset_index(
+        self,
+        level: Hashable | Sequence[Hashable] | None = None,
+        drop: bool = False,
+        name: Hashable | lib.NoDefault = lib.no_default,
+        inplace: bool = False,
+        allow_duplicates: bool = False,
+    ) -> Series | DataFrame | None:
+        """
+        Private version of reset_index with additional allow_duplicates parameter
+        """
         inplace = validate_bool_kwarg(inplace, "inplace")
         if drop:
             new_index = default_index(len(self))
@@ -1492,7 +1505,11 @@ def reset_index(self, level=None, drop=False, name=lib.no_default, inplace=False
                 name = self.name
 
             df = self.to_frame(name)
-            return df.reset_index(level=level, drop=drop)
+            return df._reset_index(
+                level=level, drop=drop, allow_duplicates=allow_duplicates
+            )
+
+        return None
 
     # ----------------------------------------------------------------------
     # Rendering Methods

From 4f65829160571e350085d967a46eb58df4e1ea57 Mon Sep 17 00:00:00 2001
From: John Zangwill
Date: Mon, 3 Jan 2022 17:29:48 +0000
Subject: [PATCH 04/12] Update test_frame_value_counts.py

---
 pandas/tests/groupby/test_frame_value_counts.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py
index 02f9226f67409..affef05ba4ed3 100644
--- a/pandas/tests/groupby/test_frame_value_counts.py
+++ b/pandas/tests/groupby/test_frame_value_counts.py
@@ -414,6 +414,7 @@ def test_mixed_groupings(normalize, expected_label, expected_values):
 )
 @pytest.mark.parametrize("as_index", [False, True])
 def test_column_label_duplicates(test, columns, expected_names, as_index):
+    # GH 44992
     # Test for duplicate input column labels and generated duplicate labels
     df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
     expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)]

From faa17e5e82f8bf6877cf86cc6b2568516990c98f Mon Sep 17 00:00:00 2001
From: John Zangwill
Date: Tue, 4 Jan 2022 09:42:55 +0000
Subject: [PATCH 05/12] Revert "redo using private methods"

This reverts commit 90933743f289abab5673943fb3a398a00c87b76b.
---
 pandas/core/frame.py           | 16 +---------------
 pandas/core/groupby/generic.py | 34 +++++++++++++++++++-----------
 pandas/core/series.py          | 19 +------------------
 3 files changed, 25 insertions(+), 44 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index c2a59deca130b..a39c1b0bf43f2 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5776,20 +5776,6 @@ class max type
         lion    mammal   80.5     run
         monkey  mammal    NaN    jump
         """
-        return self._reset_index(level, drop, inplace, col_level, col_fill)
-
-    def _reset_index(
-        self,
-        level: Hashable | Sequence[Hashable] | None = None,
-        drop: bool = False,
-        inplace: bool = False,
-        col_level: Hashable = 0,
-        col_fill: Hashable = "",
-        allow_duplicates: bool = False,
-    ) -> DataFrame | None:
-        """
-        Private version of reset_index with additional allow_duplicates parameter
-        """
         inplace = validate_bool_kwarg(inplace, "inplace")
         self._check_inplace_and_allows_duplicate_labels(inplace)
         if inplace:
@@ -5847,7 +5833,7 @@ def _reset_index(
                         level_values, lab, allow_fill=True, fill_value=lev._na_value
                     )
 
-                new_obj.insert(0, name, level_values, allow_duplicates=allow_duplicates)
+                new_obj.insert(0, name, level_values)
 
         new_obj.index = new_index
         if not inplace:
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 2d4116c05d598..9945795e6c60b 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -26,7 +26,10 @@
 
 import numpy as np
 
-from pandas._libs import reduction as libreduction
+from pandas._libs import (
+    lib,
+    reduction as libreduction,
+)
 from pandas._typing import (
     ArrayLike,
     Manager,
@@ -1731,7 +1734,7 @@ def value_counts(
             observed=self.observed,
             dropna=self.dropna,
         )
-        result = cast(Series, gb.size())
+        result = gb.size()
 
         if normalize:
             # Normalize the results by dividing by the original group sizes.
@@ -1750,13 +1753,13 @@ def value_counts(
         if sort:
             # Sort the values and then resort by the main grouping
             index_level = range(len(self.grouper.groupings))
-            result = result.sort_values(ascending=ascending).sort_index(
-                level=index_level, sort_remaining=False
+            result = (
+                cast(Series, result)
+                .sort_values(ascending=ascending)
+                .sort_index(level=index_level, sort_remaining=False)
             )
 
-        if self.as_index:
-            return result.__finalize__(self.obj, method="value_counts")
-        else:
+        if not self.as_index:
             # Convert to frame
             name = "proportion" if normalize else "count"
             columns = result.index.names
@@ -1764,10 +1767,19 @@ def value_counts(
                 raise ValueError(
                     f"Column label '{name}' is duplicate of result column"
                 )
-            result_frame = cast(
-                DataFrame, result._reset_index(name=name, allow_duplicates=True)
-            )
-            return result_frame.__finalize__(self.obj, method="value_counts")
+            columns = com.fill_missing_names(columns)
+            values = result.values
+            result_frame = DataFrame()
+            for i, column in enumerate(columns):
+                level_values = result.index.get_level_values(i)._values
+                if level_values.dtype == np.object_:
+                    level_values = lib.maybe_convert_objects(
+                        cast(np.ndarray, level_values)
+                    )
+                result_frame.insert(i, column, level_values, allow_duplicates=True)
+            result = result_frame.assign(**{name: values})
+
+        return result.__finalize__(self.obj, method="value_counts")
 
 
 def _wrap_transform_general_frame(
diff --git a/pandas/core/series.py b/pandas/core/series.py
index f619741424c5b..81b901b13a42b 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1462,19 +1462,6 @@ def reset_index(self, level=None, drop=False, name=lib.no_default, inplace=False
         2  baz    one    2
         3  baz    two    3
         """
-        return self._reset_index(level, drop, name, inplace)
-
-    def _reset_index(
-        self,
-        level: Hashable | Sequence[Hashable] | None = None,
-        drop: bool = False,
-        name: Hashable | lib.NoDefault = lib.no_default,
-        inplace: bool = False,
-        allow_duplicates: bool = False,
-    ) -> Series | DataFrame | None:
-        """
-        Private version of reset_index with additional allow_duplicates parameter
-        """
         inplace = validate_bool_kwarg(inplace, "inplace")
         if drop:
             new_index = default_index(len(self))
@@ -1505,11 +1492,7 @@ def _reset_index(
                 name = self.name
 
             df = self.to_frame(name)
-            return df._reset_index(
-                level=level, drop=drop, allow_duplicates=allow_duplicates
-            )
-
-        return None
+            return df.reset_index(level=level, drop=drop)
 
     # ----------------------------------------------------------------------
     # Rendering Methods

From c097e5d0bd881682b8c055e4ee8cf804f51160a0 Mon Sep 17 00:00:00 2001
From: John Zangwill
Date: Tue, 4 Jan 2022 12:28:01 +0000
Subject: [PATCH 06/12] Update generic.py

---
 pandas/core/groupby/generic.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 9945795e6c60b..399df608907d4 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1734,7 +1734,7 @@ def value_counts(
            observed=self.observed,
             dropna=self.dropna,
         )
-        result = gb.size()
+        result = cast(Series, gb.size())
 
         if normalize:
             # Normalize the results by dividing by the original group sizes.
@@ -1753,13 +1753,13 @@ def value_counts(
         if sort:
             # Sort the values and then resort by the main grouping
             index_level = range(len(self.grouper.groupings))
-            result = (
-                cast(Series, result)
-                .sort_values(ascending=ascending)
-                .sort_index(level=index_level, sort_remaining=False)
+            result = result.sort_values(ascending=ascending).sort_index(
+                level=index_level, sort_remaining=False
             )
 
-        if not self.as_index:
+        if self.as_index:
+            return result.__finalize__(self.obj, method="value_counts")
+        else:
             # Convert to frame
             name = "proportion" if normalize else "count"
             columns = result.index.names
@@ -1768,7 +1768,6 @@ def value_counts(
                 f"Column label '{name}' is duplicate of result column"
             )
             columns = com.fill_missing_names(columns)
-            values = result.values
             result_frame = DataFrame()
             for i, column in enumerate(columns):
                 level_values = result.index.get_level_values(i)._values
@@ -1777,9 +1776,8 @@ def value_counts(
                     cast(np.ndarray, level_values)
                 )
                 result_frame.insert(i, column, level_values, allow_duplicates=True)
-            result = result_frame.assign(**{name: values})
-
-            return result.__finalize__(self.obj, method="value_counts")
+            result_frame = result_frame.assign(**{name: result._values})
+            return result_frame.__finalize__(self.obj, method="value_counts")
 
 
 def _wrap_transform_general_frame(

From d4901878e92e2667abaf9c6d89f1dc3fe057220a Mon Sep 17 00:00:00 2001
From: John Zangwill
Date: Tue, 4 Jan 2022 13:13:51 +0000
Subject: [PATCH 07/12] Update generic.py

---
 pandas/core/groupby/generic.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 399df608907d4..534d44757311b 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -86,6 +86,7 @@
     MultiIndex,
     all_indexes_same,
 )
+from pandas.core.reshape.concat import concat
 from pandas.core.series import Series
 from pandas.core.util.numba_ import maybe_use_numba
 
@@ -1768,7 +1769,7 @@ def value_counts(
                 f"Column label '{name}' is duplicate of result column"
             )
             columns = com.fill_missing_names(columns)
-            result_frame = DataFrame()
+            result_frame = DataFrame(index=result.index)
             for i, column in enumerate(columns):
                 level_values = result.index.get_level_values(i)._values
                 if level_values.dtype == np.object_:
@@ -1776,8 +1777,12 @@ def value_counts(
                     cast(np.ndarray, level_values)
                 )
                 result_frame.insert(i, column, level_values, allow_duplicates=True)
-            result_frame = result_frame.assign(**{name: result._values})
-            return result_frame.__finalize__(self.obj, method="value_counts")
+            result.name = name
+            return (
+                concat([result_frame, result], axis=1)
+                .reset_index(drop=True)
+                .__finalize__(self.obj, method="value_counts")
+            )
 
 
 def _wrap_transform_general_frame(

From 89c90c44ba9d7f90096e16602dec824049e45c7b Mon Sep 17 00:00:00 2001
From: John Zangwill
Date: Tue, 4 Jan 2022 13:36:18 +0000
Subject: [PATCH 08/12] Update generic.py

---
 pandas/core/groupby/generic.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 534d44757311b..a8752c8a25ed1 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1769,7 +1769,7 @@ def value_counts(
                 f"Column label '{name}' is duplicate of result column"
             )
             columns = com.fill_missing_names(columns)
-            result_frame = DataFrame(index=result.index)
+            result_frame = DataFrame()
             for i, column in enumerate(columns):
                 level_values = result.index.get_level_values(i)._values
                 if level_values.dtype == np.object_:
@@ -1777,12 +1777,8 @@ def value_counts(
                     cast(np.ndarray, level_values)
                 )
                 result_frame.insert(i, column, level_values, allow_duplicates=True)
-            result.name = name
-            return (
-                concat([result_frame, result], axis=1)
-                .reset_index(drop=True)
-                .__finalize__(self.obj, method="value_counts")
-            )
+            result_frame.insert(len(columns), name, result._values)
+            return result_frame.__finalize__(self.obj, method="value_counts")
 
 
 def _wrap_transform_general_frame(

From 92999fb45a1b02f199fc48a1cbed4d17e4635fe0 Mon Sep 17 00:00:00 2001
From: John Zangwill
Date: Tue, 4 Jan 2022 13:36:54 +0000
Subject: [PATCH 09/12] Update generic.py

---
 pandas/core/groupby/generic.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index a8752c8a25ed1..2a17fe637bbd1 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -86,7 +86,6 @@
     MultiIndex,
     all_indexes_same,
 )
-from pandas.core.reshape.concat import concat
 from pandas.core.series import Series
 from pandas.core.util.numba_ import maybe_use_numba
 

From 6e5567044c28f42b9f6656ea9f1d668a374145cd Mon Sep 17 00:00:00 2001
From: John Zangwill
Date: Sun, 9 Jan 2022 17:17:58 +0000
Subject: [PATCH 10/12] Back to reset_index

---
 pandas/core/groupby/generic.py | 22 +++++++--------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 2a17fe637bbd1..879f18a9fc172 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -26,10 +26,7 @@
 
 import numpy as np
 
-from pandas._libs import (
-    lib,
-    reduction as libreduction,
-)
+from pandas._libs import reduction as libreduction
 from pandas._typing import (
     ArrayLike,
     Manager,
@@ -1762,21 +1759,16 @@ def value_counts(
         else:
             # Convert to frame
             name = "proportion" if normalize else "count"
-            columns = result.index.names
+            index = result.index
+            columns = com.fill_missing_names(index.names)
             if name in columns:
                 raise ValueError(
                     f"Column label '{name}' is duplicate of result column"
                 )
-            columns = com.fill_missing_names(columns)
-            result_frame = DataFrame()
-            for i, column in enumerate(columns):
-                level_values = result.index.get_level_values(i)._values
-                if level_values.dtype == np.object_:
-                    level_values = lib.maybe_convert_objects(
-                        cast(np.ndarray, level_values)
-                    )
-                result_frame.insert(i, column, level_values, allow_duplicates=True)
-            result_frame.insert(len(columns), name, result._values)
+            result.name = name
+            result.index = index.set_names(range(len(columns)))
+            result_frame = result.reset_index()
+            result_frame = result_frame.set_axis(columns + [name], axis=1)
             return result_frame.__finalize__(self.obj, method="value_counts")

From fece32b1003e2f87b599ab692d580c65488ce5c5 Mon Sep 17 00:00:00 2001
From: John Zangwill
Date: Sun, 16 Jan 2022 23:18:26 +0000
Subject: [PATCH 11/12] Update generic.py

---
 pandas/core/groupby/generic.py | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 879f18a9fc172..29411b9c722a9 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1731,45 +1731,49 @@ def value_counts(
             observed=self.observed,
             dropna=self.dropna,
         )
-        result = cast(Series, gb.size())
+        result_series = cast(Series, gb.size())
 
         if normalize:
             # Normalize the results by dividing by the original group sizes.
             # We are guaranteed to have the first N levels be the
             # user-requested grouping.
-            levels = list(range(len(self.grouper.groupings), result.index.nlevels))
-            indexed_group_size = result.groupby(
-                result.index.droplevel(levels),
+            levels = list(
+                range(len(self.grouper.groupings), result_series.index.nlevels)
+            )
+            indexed_group_size = result_series.groupby(
+                result_series.index.droplevel(levels),
                 sort=self.sort,
                 observed=self.observed,
                 dropna=self.dropna,
             ).transform("sum")
 
-            result /= indexed_group_size
+            result_series /= indexed_group_size
 
         if sort:
             # Sort the values and then resort by the main grouping
             index_level = range(len(self.grouper.groupings))
-            result = result.sort_values(ascending=ascending).sort_index(
-                level=index_level, sort_remaining=False
-            )
+            result_series = result_series.sort_values(
+                ascending=ascending
+            ).sort_index(level=index_level, sort_remaining=False)
 
+        result: Series | DataFrame
         if self.as_index:
-            return result.__finalize__(self.obj, method="value_counts")
+            result = result_series
         else:
             # Convert to frame
             name = "proportion" if normalize else "count"
-            index = result.index
+            index = result_series.index
             columns = com.fill_missing_names(index.names)
             if name in columns:
                 raise ValueError(
                     f"Column label '{name}' is duplicate of result column"
                 )
-            result.name = name
-            result.index = index.set_names(range(len(columns)))
-            result_frame = result.reset_index()
-            result_frame = result_frame.set_axis(columns + [name], axis=1)
-            return result_frame.__finalize__(self.obj, method="value_counts")
+            result_series.name = name
+            result_series.index = index.set_names(range(len(columns)))
+            result_frame = result_series.reset_index()
+            result_frame.columns = columns + [name]
+            result = result_frame
+        return result.__finalize__(self.obj, method="value_counts")
 
 
 def _wrap_transform_general_frame(

From 36f2b0d00425b568d09988cb99066da936ed85ff Mon Sep 17 00:00:00 2001
From: John Zangwill
Date: Sun, 16 Jan 2022 23:21:56 +0000
Subject: [PATCH 12/12] Trigger CI
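A minimal usage sketch of the behaviour this series implements and tests (the frame and grouping keys are taken from test_column_label_duplicates above; it assumes a pandas build that includes these patches, GH 44992):

import pandas as pd

# Duplicate column labels ("b" appears twice), as in test_column_label_duplicates.
df = pd.DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=list("abbde"))

# Group by a column label, an in-memory array and another label. With
# as_index=False the result is a DataFrame whose last column is "count"
# ("proportion" when normalize=True), and duplicate labels are preserved
# instead of raising "cannot insert".
result = df.groupby(["a", [0, 1], "d"], as_index=False).value_counts()
print(result)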