Skip to content

Commit 87cb141

Browse files
Backport PR pandas-dev#55627 on branch 2.1.x (BUG: value_counts returning incorrect dtype for string dtype) (pandas-dev#55682)
Backport PR pandas-dev#55627: BUG: value_counts returning incorrect dtype for string dtype. Co-authored-by: Patrick Hoefler <[email protected]>
1 parent b7aaaf9 commit 87cb141

File tree

4 files changed

+68
-2
lines changed

4 files changed

+68
-2
lines changed

doc/source/whatsnew/v2.1.2.rst

+1
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ Fixed regressions
2525
Bug fixes
2626
~~~~~~~~~
2727
- Fixed bug in :class:`.DataFrameGroupBy` reductions not preserving object dtype when ``infer_string`` is set (:issue:`55620`)
28+
- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55627`)
2829
- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
2930
- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
3031
- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)

pandas/core/groupby/groupby.py

+13-2
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,10 @@ class providing the base-class of operations.
108108
SparseArray,
109109
)
110110
from pandas.core.arrays.string_ import StringDtype
111+
from pandas.core.arrays.string_arrow import (
112+
ArrowStringArray,
113+
ArrowStringArrayNumpySemantics,
114+
)
111115
from pandas.core.base import (
112116
PandasObject,
113117
SelectionMixin,
@@ -2803,7 +2807,9 @@ def _value_counts(
28032807
result_series.name = name
28042808
result_series.index = index.set_names(range(len(columns)))
28052809
result_frame = result_series.reset_index()
2806-
result_frame.columns = columns + [name]
2810+
orig_dtype = self.grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] # noqa: E501
2811+
cols = Index(columns, dtype=orig_dtype).insert(len(columns), name)
2812+
result_frame.columns = cols
28072813
result = result_frame
28082814
return result.__finalize__(self.obj, method="value_counts")
28092815

@@ -2955,7 +2961,12 @@ def size(self) -> DataFrame | Series:
29552961
dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None
29562962
if isinstance(self.obj, Series):
29572963
if isinstance(self.obj.array, ArrowExtensionArray):
2958-
dtype_backend = "pyarrow"
2964+
if isinstance(self.obj.array, ArrowStringArrayNumpySemantics):
2965+
dtype_backend = None
2966+
elif isinstance(self.obj.array, ArrowStringArray):
2967+
dtype_backend = "numpy_nullable"
2968+
else:
2969+
dtype_backend = "pyarrow"
29592970
elif isinstance(self.obj.array, BaseMaskedArray):
29602971
dtype_backend = "numpy_nullable"
29612972
# TODO: For DataFrames what if columns are mixed arrow/numpy/masked?

pandas/tests/groupby/test_size.py

+24
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
import numpy as np
22
import pytest
33

4+
import pandas.util._test_decorators as td
5+
46
from pandas.core.dtypes.common import is_integer_dtype
57

68
from pandas import (
@@ -104,3 +106,25 @@ def test_size_series_masked_type_returns_Int64(dtype):
104106
result = ser.groupby(level=0).size()
105107
expected = Series([2, 1], dtype="Int64", index=["a", "b"])
106108
tm.assert_series_equal(result, expected)
109+
110+
111+
@pytest.mark.parametrize(
112+
"dtype",
113+
[
114+
object,
115+
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
116+
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
117+
],
118+
)
119+
def test_size_strings(dtype):
120+
# GH#55627
121+
df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
122+
result = df.groupby("a")["b"].size()
123+
exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
124+
expected = Series(
125+
[2, 1],
126+
index=Index(["a", "b"], name="a", dtype=dtype),
127+
name="b",
128+
dtype=exp_dtype,
129+
)
130+
tm.assert_series_equal(result, expected)

pandas/tests/groupby/test_value_counts.py

+30
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@
99
import numpy as np
1010
import pytest
1111

12+
import pandas.util._test_decorators as td
13+
1214
from pandas import (
1315
Categorical,
1416
CategoricalIndex,
@@ -366,6 +368,14 @@ def test_against_frame_and_seriesgroupby(
366368
tm.assert_frame_equal(result, expected)
367369

368370

371+
@pytest.mark.parametrize(
372+
"dtype",
373+
[
374+
object,
375+
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
376+
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
377+
],
378+
)
369379
@pytest.mark.parametrize("normalize", [True, False])
370380
@pytest.mark.parametrize(
371381
"sort, ascending, expected_rows, expected_count, expected_group_size",
@@ -383,7 +393,10 @@ def test_compound(
383393
expected_rows,
384394
expected_count,
385395
expected_group_size,
396+
dtype,
386397
):
398+
education_df = education_df.astype(dtype)
399+
education_df.columns = education_df.columns.astype(dtype)
387400
# Multiple groupby keys and as_index=False
388401
gp = education_df.groupby(["country", "gender"], as_index=False, sort=False)
389402
result = gp["education"].value_counts(
@@ -392,11 +405,17 @@ def test_compound(
392405
expected = DataFrame()
393406
for column in ["country", "gender", "education"]:
394407
expected[column] = [education_df[column][row] for row in expected_rows]
408+
expected = expected.astype(dtype)
409+
expected.columns = expected.columns.astype(dtype)
395410
if normalize:
396411
expected["proportion"] = expected_count
397412
expected["proportion"] /= expected_group_size
413+
if dtype == "string[pyarrow]":
414+
expected["proportion"] = expected["proportion"].convert_dtypes()
398415
else:
399416
expected["count"] = expected_count
417+
if dtype == "string[pyarrow]":
418+
expected["count"] = expected["count"].convert_dtypes()
400419
tm.assert_frame_equal(result, expected)
401420

402421

@@ -1143,3 +1162,14 @@ def test_value_counts_time_grouper(utc):
11431162
)
11441163
expected = Series(1, index=index, name="count")
11451164
tm.assert_series_equal(result, expected)
1165+
1166+
1167+
def test_value_counts_integer_columns():
1168+
# GH#55627
1169+
df = DataFrame({1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"]})
1170+
gp = df.groupby([1, 2], as_index=False, sort=False)
1171+
result = gp[3].value_counts()
1172+
expected = DataFrame(
1173+
{1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1}
1174+
)
1175+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)