From a9e99cd951d369276bdb903ae67602e2fe001703 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 22 Oct 2023 16:42:20 +0200 Subject: [PATCH 1/6] BUG: value_counts returning incorrect dtype for string dtype --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/groupby/groupby.py | 15 +++++++++-- .../groupby/methods/test_value_counts.py | 25 +++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97a718dd496e9..70d817cab4a09 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -23,6 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55626`) - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e33c4b3579c69..955649b7d710b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -108,6 +108,10 @@ class providing the base-class of operations. SparseArray, ) from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) from pandas.core.base import ( PandasObject, SelectionMixin, @@ -2855,7 +2859,9 @@ def _value_counts( result_series.name = name result_series.index = index.set_names(range(len(columns))) result_frame = result_series.reset_index() - result_frame.columns = columns + [name] + result_frame.columns = Index( + columns + [name], dtype=self.grouper.groupings[0].obj.columns.dtype + ) result = result_frame return result.__finalize__(self.obj, method="value_counts") @@ -3007,7 +3013,12 @@ def size(self) -> DataFrame | Series: dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None if isinstance(self.obj, Series): if isinstance(self.obj.array, ArrowExtensionArray): - dtype_backend = "pyarrow" + if isinstance(self.obj.array, ArrowStringArrayNumpySemantics): + dtype_backend = None + elif isinstance(self.obj.array, ArrowStringArray): + dtype_backend = "numpy_nullable" + else: + dtype_backend = "pyarrow" elif isinstance(self.obj.array, BaseMaskedArray): dtype_backend = "numpy_nullable" # TODO: For DataFrames what if columns are mixed arrow/numpy/masked? diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index c1ee107715b71..c5c2eca813e86 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas.compat import pa_version_under7p0 + from pandas import ( Categorical, CategoricalIndex, @@ -369,6 +371,20 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param( + "string[pyarrow_numpy]", + marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + ), + pytest.param( + "string[pyarrow]", + marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), + ), + ], +) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -386,7 +402,10 @@ def test_compound( expected_rows, expected_count, expected_group_size, + dtype, ): + education_df = education_df.astype(dtype) + education_df.columns = education_df.columns.astype(dtype) # Multiple groupby keys and as_index=False gp = education_df.groupby(["country", "gender"], as_index=False, sort=False) result = gp["education"].value_counts( @@ -395,11 +414,17 @@ def test_compound( expected = DataFrame() for column in ["country", "gender", "education"]: expected[column] = [education_df[column][row] for row in expected_rows] + expected = expected.astype(dtype) + expected.columns = expected.columns.astype(dtype) if normalize: expected["proportion"] = expected_count expected["proportion"] /= expected_group_size + if dtype == "string[pyarrow]": + expected["proportion"] = expected["proportion"].convert_dtypes() else: expected["count"] = expected_count + if dtype == "string[pyarrow]": + expected["count"] = expected["count"].convert_dtypes() tm.assert_frame_equal(result, expected) From b7f44bbd50427fd56bb9b00d3d82d407789aef7e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 22 Oct 2023 16:50:35 +0200 Subject: [PATCH 2/6] Update v2.1.2.rst --- doc/source/whatsnew/v2.1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 70d817cab4a09..751e838ba0c96 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -23,7 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55626`) +- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55627`) - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) From 307c55ad99adb3c1075575a39de6c38524906531 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 24 Oct 2023 22:25:53 +0200 Subject: [PATCH 3/6] Fix --- pandas/core/groupby/groupby.py | 5 ++--- pandas/tests/groupby/methods/test_value_counts.py | 12 +++--------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fe9d78e48a886..5654ee3a46486 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2858,9 +2858,8 @@ def _value_counts( result_series.name = name result_series.index = index.set_names(range(len(columns))) result_frame = result_series.reset_index() - result_frame.columns = Index( - columns + [name], dtype=self.grouper.groupings[0].obj.columns.dtype - ) + orig_dtype = self.grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] # noqa: E501 + result_frame.columns = Index(columns + [name], dtype=orig_dtype) result = result_frame return result.__finalize__(self.obj, method="value_counts") diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index c5c2eca813e86..9dec6cde1c53b 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas.compat import pa_version_under7p0 +import pandas.util._test_decorators as td from pandas import ( Categorical, @@ -375,14 +375,8 @@ def test_against_frame_and_seriesgroupby( "dtype", [ object, - pytest.param( - "string[pyarrow_numpy]", - marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), - ), - pytest.param( - "string[pyarrow]", - marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"), - ), + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), ], ) @pytest.mark.parametrize("normalize", [True, False]) From 7b69172e56dc0452dac6a593a7f7e3cd60075f06 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 24 Oct 2023 22:39:56 +0200 Subject: [PATCH 4/6] Add test --- pandas/core/groupby/groupby.py | 3 ++- pandas/tests/groupby/methods/test_value_counts.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5654ee3a46486..5cfb075b807f4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2859,7 +2859,8 @@ def _value_counts( result_series.index = index.set_names(range(len(columns))) result_frame = result_series.reset_index() orig_dtype = self.grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] # noqa: E501 - result_frame.columns = Index(columns + [name], dtype=orig_dtype) + cols = Index(columns, dtype=orig_dtype).insert(len(columns), name) + result_frame.columns = cols result = result_frame return result.__finalize__(self.obj, method="value_counts") diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 9dec6cde1c53b..37f9d26284f52 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -1165,3 +1165,14 @@ def test_value_counts_time_grouper(utc): ) expected = Series(1, index=index, name="count") tm.assert_series_equal(result, expected) + + +def test_value_counts_integer_columns(): + # GH#55627 + df = DataFrame({1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"]}) + gp = df.groupby([1, 2], as_index=False, sort=False) + result = gp[3].value_counts() + expected = DataFrame( + {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1} + ) + tm.assert_frame_equal(result, expected) From 7a13d8ca457728692d7853d0312bb76048252b8d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 24 Oct 2023 23:22:34 +0200 Subject: [PATCH 5/6] Add test --- pandas/tests/groupby/methods/test_size.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index c275db9d1788c..862569c2f931e 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_integer_dtype from pandas import ( @@ -104,3 +106,24 @@ def test_size_series_masked_type_returns_Int64(dtype): result = ser.groupby(level=0).size() expected = Series([2, 1], dtype="Int64", index=["a", "b"]) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) +def test_size_strings(dtype): + df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) + result = df.groupby("a")["b"].size() + exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" + expected = Series( + [2, 1], + index=Index(["a", "b"], name="a", dtype=dtype), + name="b", + dtype=exp_dtype, + ) + tm.assert_series_equal(result, expected) From e16c29e544fcc95d537d29becfe3496d435fb7ce Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 24 Oct 2023 23:38:42 +0200 Subject: [PATCH 6/6] Add gh ref --- pandas/tests/groupby/methods/test_size.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 862569c2f931e..93a4e743d0d71 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -117,6 +117,7 @@ def test_size_series_masked_type_returns_Int64(dtype): ], ) def test_size_strings(dtype): + # GH#55627 df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"