Skip to content

Commit c26935c

Browse files
Backport PR #55620 on branch 2.1.x (BUG: Groupby not keeping object dtype when infer string is set) (#55629)
Backport PR #55620: BUG: Groupby not keeping object dtype when infer string is set

Co-authored-by: Patrick Hoefler <[email protected]>
1 parent 3c1fe5c commit c26935c

File tree

3 files changed

+16
-4
lines changed

3 files changed

+16
-4
lines changed

doc/source/whatsnew/v2.1.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Fixed regressions
2323

2424
Bug fixes
2525
~~~~~~~~~
26+
- Fixed bug in :class:`.DataFrameGroupBy` reductions not preserving object dtype when ``infer_string`` is set (:issue:`55620`)
2627
- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
2728
- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
2829
- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)

pandas/core/groupby/groupby.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1855,8 +1855,7 @@ def _agg_py_fallback(
18551855
ser = Series(values, copy=False)
18561856
else:
18571857
# We only get here with values.dtype == object
1858-
# TODO: special case not needed with ArrayManager
1859-
df = DataFrame(values.T)
1858+
df = DataFrame(values.T, dtype=values.dtype)
18601859
# bc we split object blocks in grouped_reduce, we have only 1 col
18611860
# otherwise we'd have to worry about block-splitting GH#39329
18621861
assert df.shape[1] == 1

pandas/tests/groupby/test_groupby.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import numpy as np
66
import pytest
77

8+
from pandas.compat import pa_version_under7p0
89
from pandas.errors import (
910
PerformanceWarning,
1011
SpecificationError,
@@ -2513,13 +2514,24 @@ def test_groupby_column_index_name_lost(func):
25132514
tm.assert_index_equal(result, expected)
25142515

25152516

2516-
def test_groupby_duplicate_columns():
2517+
@pytest.mark.parametrize(
2518+
"infer_string",
2519+
[
2520+
False,
2521+
pytest.param(
2522+
True,
2523+
marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"),
2524+
),
2525+
],
2526+
)
2527+
def test_groupby_duplicate_columns(infer_string):
25172528
# GH: 31735
25182529
df = DataFrame(
25192530
{"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]}
25202531
).astype(object)
25212532
df.columns = ["A", "B", "B"]
2522-
result = df.groupby([0, 0, 0, 0]).min()
2533+
with pd.option_context("future.infer_string", infer_string):
2534+
result = df.groupby([0, 0, 0, 0]).min()
25232535
expected = DataFrame(
25242536
[["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"], dtype=object
25252537
)

0 commit comments

Comments
 (0)