Skip to content

BUG: Incorrect index shape when using a user-defined function for agg… #179

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -789,6 +789,7 @@ Groupby/resample/rolling
- :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
- Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
- Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
- Bug in :meth:`SeriesGroupBy.aggregate` where using a user-defined function to aggregate a ``Series`` with an object-typed :class:`Index` causes an incorrect :class:`Index` shape (issue:`40014`)
- Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`)
- Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising ``ValueError`` when using with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`)
- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementations bounds (:issue:`40767`)
Expand Down
9 changes: 5 additions & 4 deletions pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ cdef class _BaseGrouper:
cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp,
Slider islider, Slider vslider):
if cached_typ is None:
cached_ityp = self.ityp(islider.buf)
cached_ityp = self.ityp(islider.buf, dtype=self.idtype)
cached_typ = self.typ(
vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, name=self.name
)
Expand All @@ -70,7 +70,6 @@ cdef class _BaseGrouper:
cached_typ._mgr.set_values(vslider.buf)
object.__setattr__(cached_typ, '_index', cached_ityp)
object.__setattr__(cached_typ, 'name', self.name)

return cached_typ, cached_ityp

cdef inline object _apply_to_group(self,
Expand Down Expand Up @@ -106,7 +105,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):

cdef public:
ndarray arr, index, dummy_arr, dummy_index
object values, f, bins, typ, ityp, name
object values, f, bins, typ, ityp, name, idtype

def __init__(self, object series, object f, object bins):

Expand All @@ -122,6 +121,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):
self.arr = values
self.typ = series._constructor
self.ityp = series.index._constructor
self.idtype = series.index.dtype
self.index = series.index.values
self.name = series.name

Expand Down Expand Up @@ -199,7 +199,7 @@ cdef class SeriesGrouper(_BaseGrouper):

cdef public:
ndarray arr, index, dummy_arr, dummy_index
object f, labels, values, typ, ityp, name
object f, labels, values, typ, ityp, name, idtype

def __init__(self, object series, object f, ndarray[intp_t] labels,
Py_ssize_t ngroups):
Expand All @@ -218,6 +218,7 @@ cdef class SeriesGrouper(_BaseGrouper):
self.arr = values
self.typ = series._constructor
self.ityp = series.index._constructor
self.idtype = series.index.dtype
self.index = series.index.values
self.name = series.name

Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1221,3 +1221,18 @@ def test_aggregate_numeric_object_dtype():
{"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}
).set_index("key")
tm.assert_frame_equal(result, expected)


def test_groupby_index_object_dtype():
# GH 40014
df = DataFrame({"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]})
df.index = df.index.astype("O")
grouped = df.groupby(["c0", "c1"])
res = grouped.p.agg(lambda x: all(x > 0))
# Check that providing a user-defined function in agg()
# produces the correct index shape when using an object-typed index.
expected_index = MultiIndex.from_tuples(
[("x", "x"), ("x", "y")], names=("c0", "c1")
)
expected = Series([False, True], index=expected_index, name="p")
tm.assert_series_equal(res, expected)
16 changes: 16 additions & 0 deletions pandas/tests/groupby/test_bin_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,22 @@ def test_series_grouper():
tm.assert_almost_equal(counts, exp_counts)


def test_series_grouper_result_length_difference():
# GH 40014
obj = Series(np.random.randn(10), dtype="float64")
obj.index = obj.index.astype("O")
labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)

grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2)
result, counts = grouper.get_result()

expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)])
tm.assert_equal(result, expected)

exp_counts = np.array([3, 4], dtype=np.int64)
tm.assert_equal(counts, exp_counts)


def test_series_grouper_requires_nonempty_raises():
# GH#29500
obj = Series(np.random.randn(10))
Expand Down