Skip to content

Commit d7ff4e6

Browse files
jbrockmendel authored and hweecat committed
PERF: perform reductions block-wise (pandas-dev#29847)
1 parent f4d3806 commit d7ff4e6

File tree

4 files changed

+48
-2
lines changed

4 files changed

+48
-2
lines changed

pandas/core/frame.py

+20
Original file line numberDiff line numberDiff line change
@@ -7746,6 +7746,26 @@ def _get_data(axis_matters):
77467746
raise NotImplementedError(msg)
77477747
return data
77487748

7749+
if numeric_only is not None and axis in [0, 1]:
7750+
df = self
7751+
if numeric_only is True:
7752+
df = _get_data(axis_matters=True)
7753+
if axis == 1:
7754+
df = df.T
7755+
axis = 0
7756+
7757+
out_dtype = "bool" if filter_type == "bool" else None
7758+
7759+
# After possibly _get_data and transposing, we are now in the
7760+
# simple case where we can use BlockManager._reduce
7761+
res = df._data.reduce(op, axis=1, skipna=skipna, **kwds)
7762+
assert isinstance(res, dict)
7763+
if len(res):
7764+
assert len(res) == max(list(res.keys())) + 1, res.keys()
7765+
out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype)
7766+
out.index = df.columns
7767+
return out
7768+
77497769
if numeric_only is None:
77507770
values = self.values
77517771
try:

pandas/core/internals/managers.py

+26
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,32 @@ def _verify_integrity(self):
340340
f"tot_items: {tot_items}"
341341
)
342342

343+
def reduce(self, func, *args, **kwargs):
    """
    Apply a reduction function to the values of each block.

    Parameters
    ----------
    func : callable
        Invoked as ``func(blk.values, *args, **kwargs)`` once per block.
    *args, **kwargs
        Passed through to ``func`` unchanged.

    Returns
    -------
    scalar or dict
        When ``self.ndim == 1`` the result of ``func`` on the first block
        is returned as-is.  Otherwise a dict is returned that maps every
        entry of each block's ``mgr_locs.as_array`` to the matching
        reduced value.
    """
    if self.ndim == 1:
        # Only the first block is consulted; its reduction is the result.
        return func(self.blocks[0].values, *args, **kwargs)

    # 2D case: the reduction is assumed to operate column-wise.
    results = {}
    for block in self.blocks:
        reduced = func(block.values, *args, **kwargs)

        if np.ndim(reduced) != 0:
            # One reduced value per row of the block.
            assert reduced.ndim == 1, reduced.shape
            assert block.shape[0] == len(reduced), (
                block.shape,
                reduced.shape,
                args,
                kwargs,
            )
            block_results = dict(zip(block.mgr_locs.as_array, reduced))
        else:
            # Scalar result (the "EA" case); the block must hold a single row.
            assert block.shape[0] == 1
            block_results = dict(zip(block.mgr_locs.as_array, [reduced]))

        # Each location may be filled by exactly one block.
        assert not any(loc in results for loc in block_results)
        results.update(block_results)

    return results
368+
343369
def apply(self, f, filter=None, **kwargs):
344370
"""
345371
Iterate over the blocks, collect and create a new BlockManager.

pandas/core/nanops.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -831,7 +831,7 @@ def reduction(values, axis=None, skipna=True, mask=None):
831831
try:
832832
result = getattr(values, meth)(axis, dtype=dtype_max)
833833
result.fill(np.nan)
834-
except (AttributeError, TypeError, ValueError, np.core._internal.AxisError):
834+
except (AttributeError, TypeError, ValueError):
835835
result = np.nan
836836
else:
837837
result = getattr(values, meth)(axis)

pandas/tests/groupby/test_groupby.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -771,7 +771,7 @@ def test_omit_nuisance(df):
771771

772772
# won't work with axis = 1
773773
grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1)
774-
msg = r"unsupported operand type\(s\) for \+: 'Timestamp'"
774+
msg = "reduction operation 'sum' not allowed for this dtype"
775775
with pytest.raises(TypeError, match=msg):
776776
grouped.agg(lambda x: x.sum(0, numeric_only=False))
777777

0 commit comments

Comments
 (0)