Skip to content

BUG: Incorrect index shape when using a user-defined function for aggregating a grouped series with object-typed index. #40835

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 28 commits into from
Apr 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
5ded786
Add failing test.
DriesSchaumont Apr 2, 2021
30cd16e
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 7, 2021
e97b413
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 8, 2021
c5e1cf7
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 8, 2021
9ccb324
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 8, 2021
04d61b5
Dont use fastpath for series whith object index.
DriesSchaumont Apr 8, 2021
d4d9fb5
Add comment.
DriesSchaumont Apr 9, 2021
307bbe5
Add whatsnew
DriesSchaumont Apr 9, 2021
ace5f81
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 9, 2021
bb7f09e
Move test.
DriesSchaumont Apr 9, 2021
4f31837
Check test expected result.
DriesSchaumont Apr 9, 2021
db9b29b
Move whatsnew
DriesSchaumont Apr 9, 2021
fa4291a
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 9, 2021
b0dfbcd
Adjustment for review
DriesSchaumont Apr 9, 2021
4ddcc12
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 9, 2021
a21be6f
New solution.
DriesSchaumont Apr 12, 2021
7a4a793
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 12, 2021
4bf3f20
Fix len error.
DriesSchaumont Apr 12, 2021
0c16e6c
Fix index caching.
DriesSchaumont Apr 12, 2021
f457dff
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 12, 2021
3eb7b79
Fix index caching (2)
DriesSchaumont Apr 13, 2021
d608b5b
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 13, 2021
86db870
Styling and remove stray print statement
DriesSchaumont Apr 13, 2021
ded8433
Adjustments for review
DriesSchaumont Apr 13, 2021
24a1344
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 13, 2021
5cca8d2
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 13, 2021
337edbd
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 14, 2021
b89eee0
Add issue number, change almost_equal to assert_equal
DriesSchaumont Apr 14, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -789,6 +789,7 @@ Groupby/resample/rolling
- :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
- Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
- Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
- Bug in :meth:`SeriesGroupBy.aggregate` where using a user-defined function to aggregate a ``Series`` with an object-typed :class:`Index` causes an incorrect :class:`Index` shape (issue:`40014`)
- Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`)
- Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising ``ValueError`` when using with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`)

Expand Down
9 changes: 5 additions & 4 deletions pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ cdef class _BaseGrouper:
cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp,
Slider islider, Slider vslider):
if cached_typ is None:
cached_ityp = self.ityp(islider.buf)
cached_ityp = self.ityp(islider.buf, dtype=self.idtype)
cached_typ = self.typ(
vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, name=self.name
)
Expand All @@ -70,7 +70,6 @@ cdef class _BaseGrouper:
cached_typ._mgr.set_values(vslider.buf)
object.__setattr__(cached_typ, '_index', cached_ityp)
object.__setattr__(cached_typ, 'name', self.name)

return cached_typ, cached_ityp

cdef inline object _apply_to_group(self,
Expand Down Expand Up @@ -106,7 +105,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):

cdef public:
ndarray arr, index, dummy_arr, dummy_index
object values, f, bins, typ, ityp, name
object values, f, bins, typ, ityp, name, idtype

def __init__(self, object series, object f, object bins):

Expand All @@ -122,6 +121,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):
self.arr = values
self.typ = series._constructor
self.ityp = series.index._constructor
self.idtype = series.index.dtype
self.index = series.index.values
self.name = series.name

Expand Down Expand Up @@ -199,7 +199,7 @@ cdef class SeriesGrouper(_BaseGrouper):

cdef public:
ndarray arr, index, dummy_arr, dummy_index
object f, labels, values, typ, ityp, name
object f, labels, values, typ, ityp, name, idtype

def __init__(self, object series, object f, ndarray[intp_t] labels,
Py_ssize_t ngroups):
Expand All @@ -218,6 +218,7 @@ cdef class SeriesGrouper(_BaseGrouper):
self.arr = values
self.typ = series._constructor
self.ityp = series.index._constructor
self.idtype = series.index.dtype
self.index = series.index.values
self.name = series.name

Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1221,3 +1221,18 @@ def test_aggregate_numeric_object_dtype():
{"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}
).set_index("key")
tm.assert_frame_equal(result, expected)


def test_groupby_index_object_dtype():
# GH 40014
df = DataFrame({"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]})
df.index = df.index.astype("O")
grouped = df.groupby(["c0", "c1"])
res = grouped.p.agg(lambda x: all(x > 0))
# Check that providing a user-defined function in agg()
# produces the correct index shape when using an object-typed index.
expected_index = MultiIndex.from_tuples(
[("x", "x"), ("x", "y")], names=("c0", "c1")
)
expected = Series([False, True], index=expected_index, name="p")
tm.assert_series_equal(res, expected)
16 changes: 16 additions & 0 deletions pandas/tests/groupby/test_bin_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,22 @@ def test_series_grouper():
tm.assert_almost_equal(counts, exp_counts)


def test_series_grouper_result_length_difference():
# GH 40014
obj = Series(np.random.randn(10), dtype="float64")
obj.index = obj.index.astype("O")
labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)

grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2)
result, counts = grouper.get_result()

expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)])
tm.assert_equal(result, expected)

exp_counts = np.array([3, 4], dtype=np.int64)
tm.assert_equal(counts, exp_counts)


def test_series_grouper_requires_nonempty_raises():
# GH#29500
obj = Series(np.random.randn(10))
Expand Down