Skip to content

BUG: Incorrect index shape when using a user-defined function for aggregating a grouped series with object-typed index. #40835

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 28 commits into from
Apr 15, 2021
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
5ded786
Add failing test.
DriesSchaumont Apr 2, 2021
30cd16e
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 7, 2021
e97b413
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 8, 2021
c5e1cf7
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 8, 2021
9ccb324
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 8, 2021
04d61b5
Dont use fastpath for series whith object index.
DriesSchaumont Apr 8, 2021
d4d9fb5
Add comment.
DriesSchaumont Apr 9, 2021
307bbe5
Add whatsnew
DriesSchaumont Apr 9, 2021
ace5f81
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 9, 2021
bb7f09e
Move test.
DriesSchaumont Apr 9, 2021
4f31837
Check test expected result.
DriesSchaumont Apr 9, 2021
db9b29b
Move whatsnew
DriesSchaumont Apr 9, 2021
fa4291a
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 9, 2021
b0dfbcd
Adjustment for review
DriesSchaumont Apr 9, 2021
4ddcc12
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 9, 2021
a21be6f
New solution.
DriesSchaumont Apr 12, 2021
7a4a793
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 12, 2021
4bf3f20
Fix len error.
DriesSchaumont Apr 12, 2021
0c16e6c
Fix index caching.
DriesSchaumont Apr 12, 2021
f457dff
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 12, 2021
3eb7b79
Fix index caching (2)
DriesSchaumont Apr 13, 2021
d608b5b
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 13, 2021
86db870
Styling and remove stray print statement
DriesSchaumont Apr 13, 2021
ded8433
Adjustments for review
DriesSchaumont Apr 13, 2021
24a1344
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 13, 2021
5cca8d2
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 13, 2021
337edbd
Merge remote-tracking branch 'upstream/master' into fix-40014
DriesSchaumont Apr 14, 2021
b89eee0
Add issue number, change almost_equal to assert_equal
DriesSchaumont Apr 14, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,7 @@ Groupby/resample/rolling
- :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
- Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
- Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
- Bug in groupby aggregation when using an object-typed :class:`Index` (issue:`40014`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you be more specific here? as a user i am not sure what i am to do with this statement

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a 'for example df.groupby(....)......` or similar, in particular this is only for a user supplied function (e.g. via .apply right)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adjusted, hope this is more clear?

- Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`)

Reshaping
Expand Down
9 changes: 5 additions & 4 deletions pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ cdef class _BaseGrouper:
cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp,
Slider islider, Slider vslider):
if cached_typ is None:
cached_ityp = self.ityp(islider.buf)
cached_ityp = self.ityp(islider.buf, dtype=self.idtype)
cached_typ = self.typ(
vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, name=self.name
)
Expand All @@ -70,7 +70,6 @@ cdef class _BaseGrouper:
cached_typ._mgr.set_values(vslider.buf)
object.__setattr__(cached_typ, '_index', cached_ityp)
object.__setattr__(cached_typ, 'name', self.name)

return cached_typ, cached_ityp

cdef inline object _apply_to_group(self,
Expand Down Expand Up @@ -106,7 +105,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):

cdef public:
ndarray arr, index, dummy_arr, dummy_index
object values, f, bins, typ, ityp, name
object values, f, bins, typ, ityp, name, idtype

def __init__(self, object series, object f, object bins):

Expand All @@ -122,6 +121,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):
self.arr = values
self.typ = series._constructor
self.ityp = series.index._constructor
self.idtype = series.index.dtype
self.index = series.index.values
self.name = series.name

Expand Down Expand Up @@ -199,7 +199,7 @@ cdef class SeriesGrouper(_BaseGrouper):

cdef public:
ndarray arr, index, dummy_arr, dummy_index
object f, labels, values, typ, ityp, name
object f, labels, values, typ, ityp, name, idtype

def __init__(self, object series, object f, ndarray[intp_t] labels,
Py_ssize_t ngroups):
Expand All @@ -218,6 +218,7 @@ cdef class SeriesGrouper(_BaseGrouper):
self.arr = values
self.typ = series._constructor
self.ityp = series.index._constructor
self.idtype = series.index.dtype
self.index = series.index.values
self.name = series.name

Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1221,3 +1221,16 @@ def test_aggregate_numeric_object_dtype():
{"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}
).set_index("key")
tm.assert_frame_equal(result, expected)


def test_groupby_index_object_dtype():
# GH 40014
df = DataFrame({"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]})
df.index = df.index.astype("O")
grouped = df.groupby(["c0", "c1"])
res = grouped.p.agg(lambda x: all(x > 0))
expected_index = MultiIndex.from_tuples(
[("x", "x"), ("x", "y")], names=("c0", "c1")
)
expected = Series([False, True], index=expected_index, name="p")
tm.assert_series_equal(res, expected)
15 changes: 15 additions & 0 deletions pandas/tests/groupby/test_bin_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,21 @@ def test_series_grouper():
tm.assert_almost_equal(counts, exp_counts)


def test_series_grouper_result_length_difference():
obj = Series(np.random.randn(10), dtype="float64")
obj.index = obj.index.astype("O")
labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)

grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2)
result, counts = grouper.get_result()

expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)])
tm.assert_almost_equal(result, expected)

exp_counts = np.array([3, 4], dtype=np.int64)
tm.assert_almost_equal(counts, exp_counts)


def test_series_grouper_requires_nonempty_raises():
# GH#29500
obj = Series(np.random.randn(10))
Expand Down