Skip to content

Commit 5e3a5f6

Browse files
PERF: masked ops for reductions (sum) (#30982)
* POC masked ops for reductions
* fix mask for older numpy
* also use in boolean
* add min_count support
* fix preserve_dtypes test
* passthrough min_count for boolean as well
* fix comment
* add object to empty reduction test case
* test platform int
* Test sum separately with platform int
* share min_count checking helper function with nanops
* type + add docstring for min_count
* move sum algo from ops to array_algos
* add Int64/boolean to some benchmarks
* add whatsnew
* add skipna default in function signature
* update type hint + deprivatize
* update another type hint
1 parent 9130da9 commit 5e3a5f6

File tree

10 files changed

+139
-34
lines changed

10 files changed

+139
-34
lines changed

asv_bench/benchmarks/series_methods.py

+14-11
Original file line numberDiff line numberDiff line change
@@ -223,27 +223,27 @@ def time_series_datetimeindex_repr(self):
223223

224224
class All:
225225

226-
params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
227-
param_names = ["N", "case"]
226+
params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]]
227+
param_names = ["N", "case", "dtype"]
228228

229-
def setup(self, N, case):
229+
def setup(self, N, case, dtype):
230230
val = case != "fast"
231-
self.s = Series([val] * N)
231+
self.s = Series([val] * N, dtype=dtype)
232232

233-
def time_all(self, N, case):
233+
def time_all(self, N, case, dtype):
234234
self.s.all()
235235

236236

237237
class Any:
238238

239-
params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
240-
param_names = ["N", "case"]
239+
params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]]
240+
param_names = ["N", "case", "dtype"]
241241

242-
def setup(self, N, case):
242+
def setup(self, N, case, dtype):
243243
val = case == "fast"
244-
self.s = Series([val] * N)
244+
self.s = Series([val] * N, dtype=dtype)
245245

246-
def time_any(self, N, case):
246+
def time_any(self, N, case, dtype):
247247
self.s.any()
248248

249249

@@ -265,11 +265,14 @@ class NanOps:
265265
"prod",
266266
],
267267
[10 ** 3, 10 ** 6],
268-
["int8", "int32", "int64", "float64"],
268+
["int8", "int32", "int64", "float64", "Int64", "boolean"],
269269
]
270270
param_names = ["func", "N", "dtype"]
271271

272272
def setup(self, func, N, dtype):
273+
if func == "argmax" and dtype in {"Int64", "boolean"}:
274+
# Skip argmax for nullable dtypes (Int64, boolean) since this doesn't work yet (GH-24382)
275+
raise NotImplementedError
273276
self.s = Series([1] * N, dtype=dtype)
274277
self.func = getattr(self.s, func)
275278

asv_bench/benchmarks/stat_ops.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,17 @@
77

88
class FrameOps:
99

10-
params = [ops, ["float", "int"], [0, 1]]
10+
params = [ops, ["float", "int", "Int64"], [0, 1]]
1111
param_names = ["op", "dtype", "axis"]
1212

1313
def setup(self, op, dtype, axis):
14-
df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype)
14+
if op == "mad" and dtype == "Int64" and axis == 1:
15+
# GH-33036
16+
raise NotImplementedError
17+
values = np.random.randn(100000, 4)
18+
if dtype == "Int64":
19+
values = values.astype(int)
20+
df = pd.DataFrame(values).astype(dtype)
1521
self.df_func = getattr(df, op)
1622

1723
def time_op(self, op, dtype, axis):

doc/source/whatsnew/v1.1.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,8 @@ Performance improvements
255255
sparse values from ``scipy.sparse`` matrices using the
256256
:meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
257257
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
258+
- Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`).
259+
258260

259261
.. ---------------------------------------------------------------------------
260262

pandas/core/array_algos/masked_reductions.py

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
2+
masked_reductions.py is for reduction algorithms using a mask-based approach
3+
for missing values.
4+
"""
5+
6+
import numpy as np
7+
8+
from pandas._libs import missing as libmissing
9+
from pandas.compat.numpy import _np_version_under1p17
10+
11+
from pandas.core.nanops import check_below_min_count
12+
13+
14+
def sum(
15+
values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0,
16+
):
17+
"""
18+
Sum for 1D masked array.
19+
20+
Parameters
21+
----------
22+
values : np.ndarray
23+
Numpy array with the values (can be of any dtype that supports the
24+
operation).
25+
mask : np.ndarray
26+
Boolean numpy array (True values indicate missing values).
27+
skipna : bool, default True
28+
Whether to skip NA.
29+
min_count : int, default 0
30+
The required number of valid values to perform the operation. If fewer than
31+
``min_count`` non-NA values are present the result will be NA.
32+
"""
33+
if not skipna:
34+
if mask.any():
35+
return libmissing.NA
36+
else:
37+
if check_below_min_count(values.shape, None, min_count):
38+
return libmissing.NA
39+
return np.sum(values)
40+
else:
41+
if check_below_min_count(values.shape, mask, min_count):
42+
return libmissing.NA
43+
44+
if _np_version_under1p17:
45+
return np.sum(values[~mask])
46+
else:
47+
return np.sum(values, where=~mask)

pandas/core/arrays/boolean.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from pandas.core.dtypes.missing import isna, notna
2828

2929
from pandas.core import nanops, ops
30+
from pandas.core.array_algos import masked_reductions
3031
from pandas.core.indexers import check_array_indexer
3132

3233
from .masked import BaseMaskedArray
@@ -695,6 +696,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
695696
data = self._data
696697
mask = self._mask
697698

699+
if name == "sum":
700+
return masked_reductions.sum(data, mask, skipna=skipna, **kwargs)
701+
698702
# coerce to a nan-aware float if needed
699703
if self._hasna:
700704
data = self.to_numpy("float64", na_value=np.nan)
@@ -706,7 +710,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
706710
return libmissing.NA
707711

708712
# if we have numeric op that would result in an int, coerce to int if possible
709-
if name in ["sum", "prod"] and notna(result):
713+
if name == "prod" and notna(result):
710714
int_result = np.int64(result)
711715
if int_result == result:
712716
result = int_result

pandas/core/arrays/integer.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from pandas.core.dtypes.missing import isna
2828

2929
from pandas.core import nanops, ops
30+
from pandas.core.array_algos import masked_reductions
3031
import pandas.core.common as com
3132
from pandas.core.indexers import check_array_indexer
3233
from pandas.core.ops import invalid_comparison
@@ -560,6 +561,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
560561
data = self._data
561562
mask = self._mask
562563

564+
if name == "sum":
565+
return masked_reductions.sum(data, mask, skipna=skipna, **kwargs)
566+
563567
# coerce to a nan-aware float if needed
564568
# (we explicitly use NaN within reductions)
565569
if self._hasna:
@@ -577,7 +581,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
577581

578582
# if we have a preservable numeric op,
579583
# provide coercion back to an integer type if possible
580-
elif name in ["sum", "min", "max", "prod"]:
584+
elif name in ["min", "max", "prod"]:
581585
# GH#31409 more performant than casting-then-checking
582586
result = com.cast_scalar_indexer(result)
583587

pandas/core/nanops.py

+33-6
Original file line numberDiff line numberDiff line change
@@ -1238,7 +1238,7 @@ def _maybe_null_out(
12381238
result: np.ndarray,
12391239
axis: Optional[int],
12401240
mask: Optional[np.ndarray],
1241-
shape: Tuple,
1241+
shape: Tuple[int, ...],
12421242
min_count: int = 1,
12431243
) -> float:
12441244
"""
@@ -1260,16 +1260,43 @@ def _maybe_null_out(
12601260
# GH12941, use None to auto cast null
12611261
result[null_mask] = None
12621262
elif result is not NaT:
1263-
if mask is not None:
1264-
null_mask = mask.size - mask.sum()
1265-
else:
1266-
null_mask = np.prod(shape)
1267-
if null_mask < min_count:
1263+
if check_below_min_count(shape, mask, min_count):
12681264
result = np.nan
12691265

12701266
return result
12711267

12721268

1269+
def check_below_min_count(
1270+
shape: Tuple[int, ...], mask: Optional[np.ndarray], min_count: int
1271+
):
1272+
"""
1273+
Check for the `min_count` keyword. Returns True if below `min_count` (when
1274+
missing value should be returned from the reduction).
1275+
1276+
Parameters
1277+
----------
1278+
shape : tuple
1279+
The shape of the values (`values.shape`).
1280+
mask : ndarray or None
1281+
Boolean numpy array (typically of same shape as `shape`) or None.
1282+
min_count : int
1283+
Keyword passed through from sum/prod call.
1284+
1285+
Returns
1286+
-------
1287+
bool
1288+
"""
1289+
if min_count > 0:
1290+
if mask is None:
1291+
# no missing values, only check size
1292+
non_nulls = np.prod(shape)
1293+
else:
1294+
non_nulls = mask.size - mask.sum()
1295+
if non_nulls < min_count:
1296+
return True
1297+
return False
1298+
1299+
12731300
def _zero_out_fperr(arg):
12741301
# #18044 reference this behavior to fix rolling skew/kurt issue
12751302
if isinstance(arg, np.ndarray):

pandas/tests/arrays/boolean/test_reduction.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions):
4646
if dropna:
4747
s = s.dropna()
4848

49-
if op in ("sum", "prod"):
49+
if op == "sum":
50+
assert isinstance(getattr(s, op)(), np.int_)
51+
elif op == "prod":
5052
assert isinstance(getattr(s, op)(), np.int64)
5153
elif op in ("min", "max"):
5254
assert isinstance(getattr(s, op)(), np.bool_)

pandas/tests/arrays/integer/test_dtypes.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,10 @@ def test_preserve_dtypes(op):
3434

3535
# op
3636
result = getattr(df.C, op)()
37-
assert isinstance(result, int)
37+
if op == "sum":
38+
assert isinstance(result, np.int64)
39+
else:
40+
assert isinstance(result, int)
3841

3942
# groupby
4043
result = getattr(df.groupby("A"), op)()

pandas/tests/reductions/test_reductions.py

+18-11
Original file line numberDiff line numberDiff line change
@@ -531,13 +531,14 @@ def test_sum_inf(self):
531531
res = nanops.nansum(arr, axis=1)
532532
assert np.isinf(res).all()
533533

534+
@pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean", "object"])
534535
@pytest.mark.parametrize("use_bottleneck", [True, False])
535536
@pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)])
536-
def test_empty(self, method, unit, use_bottleneck):
537+
def test_empty(self, method, unit, use_bottleneck, dtype):
537538
with pd.option_context("use_bottleneck", use_bottleneck):
538539
# GH#9422 / GH#18921
539540
# Entirely empty
540-
s = Series([], dtype=object)
541+
s = Series([], dtype=dtype)
541542
# NA by default
542543
result = getattr(s, method)()
543544
assert result == unit
@@ -560,8 +561,14 @@ def test_empty(self, method, unit, use_bottleneck):
560561
result = getattr(s, method)(skipna=True, min_count=1)
561562
assert pd.isna(result)
562563

564+
result = getattr(s, method)(skipna=False, min_count=0)
565+
assert result == unit
566+
567+
result = getattr(s, method)(skipna=False, min_count=1)
568+
assert pd.isna(result)
569+
563570
# All-NA
564-
s = Series([np.nan])
571+
s = Series([np.nan], dtype=dtype)
565572
# NA by default
566573
result = getattr(s, method)()
567574
assert result == unit
@@ -585,7 +592,7 @@ def test_empty(self, method, unit, use_bottleneck):
585592
assert pd.isna(result)
586593

587594
# Mix of valid, empty
588-
s = Series([np.nan, 1])
595+
s = Series([np.nan, 1], dtype=dtype)
589596
# Default
590597
result = getattr(s, method)()
591598
assert result == 1.0
@@ -604,22 +611,22 @@ def test_empty(self, method, unit, use_bottleneck):
604611
result = getattr(s, method)(skipna=True, min_count=0)
605612
assert result == 1.0
606613

607-
result = getattr(s, method)(skipna=True, min_count=1)
608-
assert result == 1.0
609-
610614
# GH#844 (changed in GH#9422)
611-
df = DataFrame(np.empty((10, 0)))
615+
df = DataFrame(np.empty((10, 0)), dtype=dtype)
612616
assert (getattr(df, method)(1) == unit).all()
613617

614-
s = pd.Series([1])
618+
s = pd.Series([1], dtype=dtype)
615619
result = getattr(s, method)(min_count=2)
616620
assert pd.isna(result)
617621

618-
s = pd.Series([np.nan])
622+
result = getattr(s, method)(skipna=False, min_count=2)
623+
assert pd.isna(result)
624+
625+
s = pd.Series([np.nan], dtype=dtype)
619626
result = getattr(s, method)(min_count=2)
620627
assert pd.isna(result)
621628

622-
s = pd.Series([np.nan, 1])
629+
s = pd.Series([np.nan, 1], dtype=dtype)
623630
result = getattr(s, method)(min_count=2)
624631
assert pd.isna(result)
625632

0 commit comments

Comments
 (0)