Skip to content

Commit 6efc7d7

Browse files
DriesSchaumontJulianWgs
authored and committed
BUG: Incorrect index shape when using a user-defined function for aggregating a grouped series with object-typed index. (pandas-dev#40835)
1 parent 60d0fe1 commit 6efc7d7

File tree

4 files changed

+37
-4
lines changed

4 files changed

+37
-4
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -789,6 +789,7 @@ Groupby/resample/rolling
789789
- :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
790790
- Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
791791
- Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
792+
- Bug in :meth:`SeriesGroupBy.aggregate` where using a user-defined function to aggregate a ``Series`` with an object-typed :class:`Index` causes an incorrect :class:`Index` shape (:issue:`40014`)
792793
- Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`)
793794
- Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising ``ValueError`` when using with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`)
794795
- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementation bounds (:issue:`40767`)

pandas/_libs/reduction.pyx

+5-4
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ cdef class _BaseGrouper:
5656
cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp,
5757
Slider islider, Slider vslider):
5858
if cached_typ is None:
59-
cached_ityp = self.ityp(islider.buf)
59+
cached_ityp = self.ityp(islider.buf, dtype=self.idtype)
6060
cached_typ = self.typ(
6161
vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, name=self.name
6262
)
@@ -70,7 +70,6 @@ cdef class _BaseGrouper:
7070
cached_typ._mgr.set_values(vslider.buf)
7171
object.__setattr__(cached_typ, '_index', cached_ityp)
7272
object.__setattr__(cached_typ, 'name', self.name)
73-
7473
return cached_typ, cached_ityp
7574

7675
cdef inline object _apply_to_group(self,
@@ -106,7 +105,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):
106105

107106
cdef public:
108107
ndarray arr, index, dummy_arr, dummy_index
109-
object values, f, bins, typ, ityp, name
108+
object values, f, bins, typ, ityp, name, idtype
110109

111110
def __init__(self, object series, object f, object bins):
112111

@@ -122,6 +121,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):
122121
self.arr = values
123122
self.typ = series._constructor
124123
self.ityp = series.index._constructor
124+
self.idtype = series.index.dtype
125125
self.index = series.index.values
126126
self.name = series.name
127127

@@ -199,7 +199,7 @@ cdef class SeriesGrouper(_BaseGrouper):
199199

200200
cdef public:
201201
ndarray arr, index, dummy_arr, dummy_index
202-
object f, labels, values, typ, ityp, name
202+
object f, labels, values, typ, ityp, name, idtype
203203

204204
def __init__(self, object series, object f, ndarray[intp_t] labels,
205205
Py_ssize_t ngroups):
@@ -218,6 +218,7 @@ cdef class SeriesGrouper(_BaseGrouper):
218218
self.arr = values
219219
self.typ = series._constructor
220220
self.ityp = series.index._constructor
221+
self.idtype = series.index.dtype
221222
self.index = series.index.values
222223
self.name = series.name
223224

pandas/tests/groupby/aggregate/test_aggregate.py

+15
Original file line numberDiff line numberDiff line change
@@ -1221,3 +1221,18 @@ def test_aggregate_numeric_object_dtype():
12211221
{"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}
12221222
).set_index("key")
12231223
tm.assert_frame_equal(result, expected)
1224+
1225+
1226+
def test_groupby_index_object_dtype():
1227+
# GH 40014
1228+
df = DataFrame({"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]})
1229+
df.index = df.index.astype("O")
1230+
grouped = df.groupby(["c0", "c1"])
1231+
res = grouped.p.agg(lambda x: all(x > 0))
1232+
# Check that providing a user-defined function in agg()
1233+
# produces the correct index shape when using an object-typed index.
1234+
expected_index = MultiIndex.from_tuples(
1235+
[("x", "x"), ("x", "y")], names=("c0", "c1")
1236+
)
1237+
expected = Series([False, True], index=expected_index, name="p")
1238+
tm.assert_series_equal(res, expected)

pandas/tests/groupby/test_bin_groupby.py

+16
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,22 @@ def test_series_grouper():
2727
tm.assert_almost_equal(counts, exp_counts)
2828

2929

30+
def test_series_grouper_result_length_difference():
31+
# GH 40014
32+
obj = Series(np.random.randn(10), dtype="float64")
33+
obj.index = obj.index.astype("O")
34+
labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)
35+
36+
grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2)
37+
result, counts = grouper.get_result()
38+
39+
expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)])
40+
tm.assert_equal(result, expected)
41+
42+
exp_counts = np.array([3, 4], dtype=np.int64)
43+
tm.assert_equal(counts, exp_counts)
44+
45+
3046
def test_series_grouper_requires_nonempty_raises():
3147
# GH#29500
3248
obj = Series(np.random.randn(10))

0 commit comments

Comments
 (0)