Skip to content

Commit 12c9eff

Browse files
DriesSchaumont authored and
yeshsurya committed
BUG: Incorrect index shape when using a user-defined function for aggregating a grouped series with object-typed index. (pandas-dev#40835)
1 parent 482141c commit 12c9eff

File tree

1 file changed

+39
-45
lines changed

1 file changed

+39
-45
lines changed

pandas/_libs/reduction.pyx

+39 -45
Original file line number | Diff line number | Diff line change
@@ -27,11 +27,11 @@ from pandas._libs.lib import (
2727
)
2828

2929

30-
cpdef check_result_array(object obj):
30+
cpdef check_result_array(object obj, Py_ssize_t cnt):
3131

3232
if (is_array(obj) or
33-
(isinstance(obj, list) and len(obj) == 0) or
34-
getattr(obj, 'shape', None) == (0,)):
33+
(isinstance(obj, list) and len(obj) == cnt) or
34+
getattr(obj, 'shape', None) == (cnt,)):
3535
raise ValueError('Must produce aggregated value')
3636

3737

@@ -53,43 +53,45 @@ cdef class _BaseGrouper:
5353

5454
return values, index
5555

56-
cdef _init_dummy_series_and_index(self, Slider islider, Slider vslider):
57-
"""
58-
Create Series and Index objects that we will alter in-place while iterating.
59-
"""
60-
cached_index = self.ityp(islider.buf, dtype=self.idtype)
61-
cached_series = self.typ(
62-
vslider.buf, dtype=vslider.buf.dtype, index=cached_index, name=self.name
63-
)
64-
return cached_index, cached_series
65-
66-
cdef inline _update_cached_objs(self, object cached_series, object cached_index,
56+
cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp,
6757
Slider islider, Slider vslider):
68-
# See the comment in indexes/base.py about _index_data.
69-
# We need this for EA-backed indexes that have a reference
70-
# to a 1-d ndarray like datetime / timedelta / period.
71-
cached_index._engine.clear_mapping()
72-
cached_index._cache.clear() # e.g. inferred_freq must go
73-
cached_series._mgr.set_values(vslider.buf)
58+
if cached_typ is None:
59+
cached_ityp = self.ityp(islider.buf, dtype=self.idtype)
60+
cached_typ = self.typ(
61+
vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, name=self.name
62+
)
63+
else:
64+
# See the comment in indexes/base.py about _index_data.
65+
# We need this for EA-backed indexes that have a reference
66+
# to a 1-d ndarray like datetime / timedelta / period.
67+
object.__setattr__(cached_ityp, '_index_data', islider.buf)
68+
cached_ityp._engine.clear_mapping()
69+
cached_ityp._cache.clear() # e.g. inferred_freq must go
70+
cached_typ._mgr.set_values(vslider.buf)
71+
object.__setattr__(cached_typ, '_index', cached_ityp)
72+
object.__setattr__(cached_typ, 'name', self.name)
73+
return cached_typ, cached_ityp
7474

7575
cdef inline object _apply_to_group(self,
76-
object cached_series, object cached_index,
76+
object cached_typ, object cached_ityp,
7777
bint initialized):
7878
"""
7979
Call self.f on our new group, then update to the next group.
8080
"""
8181
cdef:
8282
object res
8383

84-
# NB: we assume that _update_cached_objs has already cleared
85-
# the cache and engine mapping
86-
res = self.f(cached_series)
84+
cached_ityp._engine.clear_mapping()
85+
cached_ityp._cache.clear() # e.g. inferred_freq must go
86+
res = self.f(cached_typ)
8787
res = extract_result(res)
8888
if not initialized:
8989
# On the first pass, we check the output shape to see
9090
# if this looks like a reduction.
9191
initialized = True
92-
check_result_array(res)
92+
# In all tests other than test_series_grouper and
93+
# test_series_bin_grouper, we have len(self.dummy_arr) == 0
94+
check_result_array(res, len(self.dummy_arr))
9395

9496
return res, initialized
9597

@@ -140,7 +142,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):
140142
object res
141143
bint initialized = 0
142144
Slider vslider, islider
143-
object cached_series = None, cached_index = None
145+
object cached_typ = None, cached_ityp = None
144146

145147
counts = np.zeros(self.ngroups, dtype=np.int64)
146148

@@ -160,10 +162,6 @@ cdef class SeriesBinGrouper(_BaseGrouper):
160162

161163
result = np.empty(self.ngroups, dtype='O')
162164

163-
cached_index, cached_series = self._init_dummy_series_and_index(
164-
islider, vslider
165-
)
166-
167165
start = 0
168166
try:
169167
for i in range(self.ngroups):
@@ -173,10 +171,10 @@ cdef class SeriesBinGrouper(_BaseGrouper):
173171
islider.move(start, end)
174172
vslider.move(start, end)
175173

176-
self._update_cached_objs(
177-
cached_series, cached_index, islider, vslider)
174+
cached_typ, cached_ityp = self._update_cached_objs(
175+
cached_typ, cached_ityp, islider, vslider)
178176

179-
res, initialized = self._apply_to_group(cached_series, cached_index,
177+
res, initialized = self._apply_to_group(cached_typ, cached_ityp,
180178
initialized)
181179
start += group_size
182180

@@ -238,7 +236,7 @@ cdef class SeriesGrouper(_BaseGrouper):
238236
object res
239237
bint initialized = 0
240238
Slider vslider, islider
241-
object cached_series = None, cached_index = None
239+
object cached_typ = None, cached_ityp = None
242240

243241
labels = self.labels
244242
counts = np.zeros(self.ngroups, dtype=np.int64)
@@ -250,10 +248,6 @@ cdef class SeriesGrouper(_BaseGrouper):
250248

251249
result = np.empty(self.ngroups, dtype='O')
252250

253-
cached_index, cached_series = self._init_dummy_series_and_index(
254-
islider, vslider
255-
)
256-
257251
start = 0
258252
try:
259253
for i in range(n):
@@ -271,10 +265,10 @@ cdef class SeriesGrouper(_BaseGrouper):
271265
islider.move(start, end)
272266
vslider.move(start, end)
273267

274-
self._update_cached_objs(
275-
cached_series, cached_index, islider, vslider)
268+
cached_typ, cached_ityp = self._update_cached_objs(
269+
cached_typ, cached_ityp, islider, vslider)
276270

277-
res, initialized = self._apply_to_group(cached_series, cached_index,
271+
res, initialized = self._apply_to_group(cached_typ, cached_ityp,
278272
initialized)
279273

280274
start += group_size
@@ -297,20 +291,20 @@ cdef class SeriesGrouper(_BaseGrouper):
297291
return result, counts
298292

299293

300-
cpdef inline extract_result(object res):
294+
cpdef inline extract_result(object res, bint squeeze=True):
301295
""" extract the result object, it might be a 0-dim ndarray
302296
or a len-1 0-dim, or a scalar """
303297
if hasattr(res, "_values"):
304298
# Preserve EA
305299
res = res._values
306-
if res.ndim == 1 and len(res) == 1:
300+
if squeeze and res.ndim == 1 and len(res) == 1:
307301
res = res[0]
308302
if hasattr(res, 'values') and is_array(res.values):
309303
res = res.values
310304
if is_array(res):
311305
if res.ndim == 0:
312306
res = res.item()
313-
elif res.ndim == 1 and len(res) == 1:
307+
elif squeeze and res.ndim == 1 and len(res) == 1:
314308
res = res[0]
315309
return res
316310

@@ -495,6 +489,6 @@ cdef class BlockSlider:
495489
Ensure that we have the original blocks, blknos, and blklocs.
496490
"""
497491
mgr = self.dummy._mgr
498-
mgr.blocks = tuple(self.blocks)
492+
mgr.blocks = self.blocks
499493
mgr._blklocs = self.orig_blklocs
500494
mgr._blknos = self.orig_blknos

0 commit comments

Comments
 (0)