Skip to content

Commit a9ca0fa

Browse files
ENH/PERF: enable column-wise reductions for EA-backed columns
1 parent 6812842 commit a9ca0fa

File tree

4 files changed

+83
-1
lines changed

4 files changed

+83
-1
lines changed

pandas/core/frame.py

+65
Original file line numberDiff line numberDiff line change
@@ -7852,6 +7852,23 @@ def _count_level(self, level, axis=0, numeric_only=False):
78527852
def _reduce(
78537853
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
78547854
):
7855+
"""
7856+
Reduce DataFrame over axis with given operation.
7857+
7858+
Parameters
7859+
----------
7860+
op : func
7861+
The reducing function to be called on the values.
7862+
name : str
7863+
The name of the reduction.
7864+
axis : int
7865+
numeric_only : bool, optional
7866+
filter_type : None or "bool"
7867+
Set to "bool" for ops that give boolean results.
7868+
skipna, **kwds : keywords to pass to the `op` function
7869+
7870+
"""
7871+
column_wise = kwds.pop("column_wise", False)
78557872

78567873
assert filter_type is None or filter_type == "bool", filter_type
78577874

@@ -7898,6 +7915,13 @@ def _get_data(axis_matters):
78987915
raise NotImplementedError(msg)
78997916
return data
79007917

7918+
if axis == 0 and column_wise:
7919+
# column-wise reduction
7920+
df = self
7921+
if numeric_only is True:
7922+
df = _get_data(axis_matters=True)
7923+
return DataFrame._reduce_columns(df, op, name, skipna=skipna, **kwds)
7924+
79017925
if numeric_only is not None and axis in [0, 1]:
79027926
df = self
79037927
if numeric_only is True:
@@ -7994,6 +8018,47 @@ def blk_func(values):
79948018
result = self._constructor_sliced(result, index=labels)
79958019
return result
79968020

8021+
def _reduce_columns(self, op, name, skipna=True, **kwds):
8022+
"""
8023+
Reduce DataFrame column-wise.
8024+
8025+
Parameters
8026+
----------
8027+
op : func
8028+
The reducing function to be called on the values. Only used
8029+
for columns backed by a numpy ndarray.
8030+
name : str
8031+
The name of the reduction.
8032+
skipna, **kwds : keywords to pass to the `op` function
8033+
8034+
Returns
8035+
-------
8036+
Series
8037+
"""
8038+
result = []
8039+
8040+
for arr in self._iter_arrays():
8041+
if isinstance(arr, ExtensionArray):
8042+
# dispatch to ExtensionArray interface
8043+
val = arr._reduce(name, skipna=skipna, **kwds)
8044+
else:
8045+
# dispatch to numpy arrays
8046+
with np.errstate(all="ignore"):
8047+
val = op(arr, skipna=skipna, **kwds)
8048+
8049+
result.append(val)
8050+
8051+
return self._constructor_sliced(result, index=self.columns)
8052+
8053+
def _iter_arrays(self):
8054+
"""
8055+
Iterate over the arrays of all columns in order.
8056+
8057+
This returns the values as stored in the Block (ndarray or ExtensionArray).
8058+
"""
8059+
for i in range(len(self.columns)):
8060+
yield self._data.iget_values(i)
8061+
79978062
def nunique(self, axis=0, dropna=True) -> Series:
79988063
"""
79998064
Count distinct observations over requested axis.

pandas/core/generic.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -11067,6 +11067,7 @@ def stat_func(
1106711067
min_count=0,
1106811068
**kwargs,
1106911069
):
11070+
column_wise = kwargs.pop("column_wise", False)
1107011071
if name == "sum":
1107111072
nv.validate_sum(tuple(), kwargs)
1107211073
elif name == "prod":
@@ -11088,6 +11089,7 @@ def stat_func(
1108811089
skipna=skipna,
1108911090
numeric_only=numeric_only,
1109011091
min_count=min_count,
11092+
column_wise=column_wise,
1109111093
)
1109211094

1109311095
return set_function_name(stat_func, name, cls)
@@ -11117,6 +11119,7 @@ def _make_stat_function(
1111711119
def stat_func(
1111811120
self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
1111911121
):
11122+
column_wise = kwargs.pop("column_wise", False)
1112011123
if name == "median":
1112111124
nv.validate_median(tuple(), kwargs)
1112211125
else:
@@ -11128,7 +11131,12 @@ def stat_func(
1112811131
if level is not None:
1112911132
return self._agg_by_level(name, axis=axis, level=level, skipna=skipna)
1113011133
return self._reduce(
11131-
func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
11134+
func,
11135+
name=name,
11136+
axis=axis,
11137+
skipna=skipna,
11138+
numeric_only=numeric_only,
11139+
column_wise=column_wise,
1113211140
)
1113311141

1113411142
return set_function_name(stat_func, name, cls)

pandas/core/internals/managers.py

+8
Original file line numberDiff line numberDiff line change
@@ -998,6 +998,14 @@ def iget(self, i: int) -> "SingleBlockManager":
998998
fastpath=True,
999999
)
10001000

1001+
def iget_values(self, i: int):
    """
    Return the values held for column ``i`` (ndarray or ExtensionArray).

    ``self.blknos[i]`` selects the block that holds the column and
    ``self.blklocs[i]`` is the position handed to that block's ``iget``.
    """
    owning_block = self.blocks[self.blknos[i]]
    return owning_block.iget(self.blklocs[i])
1008+
10011009
def delete(self, item):
10021010
"""
10031011
Delete selected item (items if non-unique) in-place.

pandas/core/series.py

+1
Original file line numberDiff line numberDiff line change
@@ -3871,6 +3871,7 @@ def _reduce(
38713871
If we have an ndarray as a value, then simply perform the operation,
38723872
otherwise delegate to the object.
38733873
"""
3874+
kwds.pop("column_wise", None)
38743875
delegate = self._values
38753876

38763877
if axis is not None:

0 commit comments

Comments
 (0)