BUG: Incorrect index shape when using a user-defined function for aggregating a grouped series with object-typed index. (pandas-dev#40835)

DriesSchaumont · JulianWgs · commit 6efc7d77a3fe · 2021-07-03T13:07:59.000+02:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -789,6 +789,7 @@ Groupby/resample/rolling
 - :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
 - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
 - Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
+- Bug in :meth:`SeriesGroupBy.aggregate` where using a user-defined function to aggregate a ``Series`` with an object-typed :class:`Index` causes an incorrect :class:`Index` shape (:issue:`40014`)
 - Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`)
 - Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising ``ValueError`` when using with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`)
 - Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementations bounds (:issue:`40767`)
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
@@ -56,7 +56,7 @@ cdef class _BaseGrouper:
     cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp,
                                     Slider islider, Slider vslider):
         if cached_typ is None:
-            cached_ityp = self.ityp(islider.buf)
+            cached_ityp = self.ityp(islider.buf, dtype=self.idtype)
             cached_typ = self.typ(
                 vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, name=self.name
             )
@@ -70,7 +70,6 @@ cdef class _BaseGrouper:
             cached_typ._mgr.set_values(vslider.buf)
             object.__setattr__(cached_typ, '_index', cached_ityp)
             object.__setattr__(cached_typ, 'name', self.name)
-
         return cached_typ, cached_ityp
 
     cdef inline object _apply_to_group(self,
@@ -106,7 +105,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):
 
     cdef public:
         ndarray arr, index, dummy_arr, dummy_index
-        object values, f, bins, typ, ityp, name
+        object values, f, bins, typ, ityp, name, idtype
 
     def __init__(self, object series, object f, object bins):
 
@@ -122,6 +121,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):
         self.arr = values
         self.typ = series._constructor
         self.ityp = series.index._constructor
+        self.idtype = series.index.dtype
         self.index = series.index.values
         self.name = series.name
 
@@ -199,7 +199,7 @@ cdef class SeriesGrouper(_BaseGrouper):
 
     cdef public:
         ndarray arr, index, dummy_arr, dummy_index
-        object f, labels, values, typ, ityp, name
+        object f, labels, values, typ, ityp, name, idtype
 
     def __init__(self, object series, object f, ndarray[intp_t] labels,
                  Py_ssize_t ngroups):
@@ -218,6 +218,7 @@ cdef class SeriesGrouper(_BaseGrouper):
         self.arr = values
         self.typ = series._constructor
         self.ityp = series.index._constructor
+        self.idtype = series.index.dtype
         self.index = series.index.values
         self.name = series.name
 
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -1221,3 +1221,18 @@ def test_aggregate_numeric_object_dtype():
         {"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}
     ).set_index("key")
     tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_index_object_dtype():
+    # GH 40014
+    df = DataFrame({"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]})
+    df.index = df.index.astype("O")
+    grouped = df.groupby(["c0", "c1"])
+    res = grouped.p.agg(lambda x: all(x > 0))
+    # Check that providing a user-defined function in agg()
+    # produces the correct index shape when using an object-typed index.
+    expected_index = MultiIndex.from_tuples(
+        [("x", "x"), ("x", "y")], names=("c0", "c1")
+    )
+    expected = Series([False, True], index=expected_index, name="p")
+    tm.assert_series_equal(res, expected)
diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py
@@ -27,6 +27,22 @@ def test_series_grouper():
     tm.assert_almost_equal(counts, exp_counts)
 
 
+def test_series_grouper_result_length_difference():
+    # GH 40014
+    obj = Series(np.random.randn(10), dtype="float64")
+    obj.index = obj.index.astype("O")
+    labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)
+
+    grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2)
+    result, counts = grouper.get_result()
+
+    expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)])
+    tm.assert_equal(result, expected)
+
+    exp_counts = np.array([3, 4], dtype=np.int64)
+    tm.assert_equal(counts, exp_counts)
+
+
 def test_series_grouper_requires_nonempty_raises():
     # GH#29500
     obj = Series(np.random.randn(10))