Skip to content

PERF: masked ops for reductions (min/max) #33261

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ Performance improvements
sparse values from ``scipy.sparse`` matrices using the
:meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
- Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`).
- Performance improvement in reductions (sum, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`).


.. ---------------------------------------------------------------------------
Expand Down
41 changes: 41 additions & 0 deletions pandas/core/array_algos/masked_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,44 @@ def sum(
return np.sum(values[~mask])
else:
return np.sum(values, where=~mask)


def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
    """
    Reduction for 1D masked array.

    Parameters
    ----------
    func : np.min or np.max
    values : np.ndarray
        Numpy array with the values (can be of any dtype that supports the
        operation).
    mask : np.ndarray
        Boolean numpy array (True values indicate missing values).
    skipna : bool, default True
        Whether to skip NA.

    Returns
    -------
    scalar or NA
        The result of ``func`` on the considered values, or
        ``libmissing.NA`` when there is nothing to reduce (no values at all,
        every value masked with ``skipna=True``, or any value masked with
        ``skipna=False``).
    """
    if not skipna:
        if mask.any() or not values.size:
            # A single missing value poisons the result when NAs are not
            # skipped; min/max with empty array raise in numpy, pandas
            # returns NA.
            return libmissing.NA
        return func(values)

    # Select the valid entries once and reuse the selection for both the
    # emptiness check and the reduction itself.
    subset = values[~mask]
    if subset.size:
        return func(subset)

    # min/max with empty array raise in numpy, pandas returns NA
    return libmissing.NA


def min(values: np.ndarray, mask: np.ndarray, skipna: bool = True):
    """
    Minimum of a 1D masked array.

    Thin wrapper that hands ``np.min`` to ``_minmax`` together with the
    arguments received here.

    Parameters
    ----------
    values : np.ndarray
        Numpy array with the values.
    mask : np.ndarray
        Boolean numpy array (True values indicate missing values).
    skipna : bool, default True
        Whether to skip NA.

    Returns
    -------
    Whatever ``_minmax`` returns for ``np.min``.
    """
    reducer = np.min
    return _minmax(reducer, values, mask, skipna)


def max(values: np.ndarray, mask: np.ndarray, skipna: bool = True):
    """
    Maximum of a 1D masked array.

    Thin wrapper that hands ``np.max`` to ``_minmax`` together with the
    arguments received here.

    Parameters
    ----------
    values : np.ndarray
        Numpy array with the values.
    mask : np.ndarray
        Boolean numpy array (True values indicate missing values).
    skipna : bool, default True
        Whether to skip NA.

    Returns
    -------
    Whatever ``_minmax`` returns for ``np.max``.
    """
    reducer = np.max
    return _minmax(reducer, values, mask, skipna)
8 changes: 3 additions & 5 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,8 +696,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name == "sum":
return masked_reductions.sum(data, mask, skipna=skipna, **kwargs)
if name in {"sum", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

# coerce to a nan-aware float if needed
if self._hasna:
Expand All @@ -715,9 +716,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if int_result == result:
result = int_result

elif name in ["min", "max"] and notna(result):
result = np.bool_(result)

return result

def _maybe_mask_result(self, result, mask, other, op_name: str):
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,8 +561,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name == "sum":
return masked_reductions.sum(data, mask, skipna=skipna, **kwargs)
if name in {"sum", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

# coerce to a nan-aware float if needed
# (we explicitly use NaN within reductions)
Expand All @@ -581,7 +582,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):

# if we have a preservable numeric op,
# provide coercion back to an integer type if possible
elif name in ["min", "max", "prod"]:
elif name == "prod":
# GH#31409 more performant than casting-then-checking
result = com.cast_scalar_indexer(result)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/integer/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_preserve_dtypes(op):

# op
result = getattr(df.C, op)()
if op == "sum":
if op in {"sum", "min", "max"}:
assert isinstance(result, np.int64)
else:
assert isinstance(result, int)
Expand Down
62 changes: 45 additions & 17 deletions pandas/tests/reductions/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,27 +65,58 @@ def test_ops(self, opname, obj):
assert result.value == expected

@pytest.mark.parametrize("opname", ["max", "min"])
def test_nanops(self, opname, index_or_series):
@pytest.mark.parametrize(
"dtype, val",
[
("object", 2.0),
("float64", 2.0),
("datetime64[ns]", datetime(2011, 11, 1)),
("Int64", 2),
("boolean", True),
],
)
def test_nanminmax(self, opname, dtype, val, index_or_series):
# GH#7261
klass = index_or_series
arg_op = "arg" + opname if klass is Index else "idx" + opname

obj = klass([np.nan, 2.0])
assert getattr(obj, opname)() == 2.0
if dtype in ["Int64", "boolean"] and klass == pd.Index:
pytest.skip("EAs can't yet be stored in an index")

obj = klass([np.nan])
assert pd.isna(getattr(obj, opname)())
assert pd.isna(getattr(obj, opname)(skipna=False))
def check_missing(res):
if dtype == "datetime64[ns]":
return res is pd.NaT
elif dtype == "Int64":
return res is pd.NA
else:
return pd.isna(res)

obj = klass([], dtype=object)
assert pd.isna(getattr(obj, opname)())
assert pd.isna(getattr(obj, opname)(skipna=False))
obj = klass([None], dtype=dtype)
assert check_missing(getattr(obj, opname)())
assert check_missing(getattr(obj, opname)(skipna=False))

obj = klass([pd.NaT, datetime(2011, 11, 1)])
# check DatetimeIndex monotonic path
assert getattr(obj, opname)() == datetime(2011, 11, 1)
assert getattr(obj, opname)(skipna=False) is pd.NaT
obj = klass([], dtype=dtype)
assert check_missing(getattr(obj, opname)())
assert check_missing(getattr(obj, opname)(skipna=False))

if dtype == "object":
# generic test with object only works for empty / all NaN
return

obj = klass([None, val], dtype=dtype)
assert getattr(obj, opname)() == val
assert check_missing(getattr(obj, opname)(skipna=False))

obj = klass([None, val, None], dtype=dtype)
assert getattr(obj, opname)() == val
assert check_missing(getattr(obj, opname)(skipna=False))

@pytest.mark.parametrize("opname", ["max", "min"])
def test_nanargminmax(self, opname, index_or_series):
# GH#7261
klass = index_or_series
arg_op = "arg" + opname if klass is Index else "idx" + opname

obj = klass([pd.NaT, datetime(2011, 11, 1)])
assert getattr(obj, arg_op)() == 1
result = getattr(obj, arg_op)(skipna=False)
if klass is Series:
Expand All @@ -95,9 +126,6 @@ def test_nanops(self, opname, index_or_series):

obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT])
# check DatetimeIndex non-monotonic path
assert getattr(obj, opname)(), datetime(2011, 11, 1)
assert getattr(obj, opname)(skipna=False) is pd.NaT

assert getattr(obj, arg_op)() == 1
result = getattr(obj, arg_op)(skipna=False)
if klass is Series:
Expand Down