Skip to content

CLN: deduplicate __setitem__ and _reduce on masked arrays #34187

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 4 additions & 35 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,13 @@
is_integer_dtype,
is_list_like,
is_numeric_dtype,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna

from pandas.core import nanops, ops
from pandas.core.array_algos import masked_reductions
from pandas.core.indexers import check_array_indexer
from pandas.core import ops

from .masked import BaseMaskedArray, BaseMaskedDtype

Expand Down Expand Up @@ -347,19 +344,8 @@ def reconstruct(x):
else:
return reconstruct(result)

def __setitem__(self, key, value) -> None:
_is_scalar = is_scalar(value)
if _is_scalar:
value = [value]
value, mask = coerce_to_array(value)

if _is_scalar:
value = value[0]
mask = mask[0]

key = check_array_indexer(self, key)
self._data[key] = value
self._mask[key] = mask
def _coerce_to_array(self, value) -> Tuple[np.ndarray, np.ndarray]:
return coerce_to_array(value)

def astype(self, dtype, copy: bool = True) -> ArrayLike:
"""
Expand Down Expand Up @@ -670,24 +656,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if name in {"any", "all"}:
return getattr(self, name)(skipna=skipna, **kwargs)

data = self._data
mask = self._mask

if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

# coerce to a nan-aware float if needed
if self._hasna:
data = self.to_numpy("float64", na_value=np.nan)

op = getattr(nanops, "nan" + name)
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

if np.isnan(result):
return libmissing.NA

return result
return super()._reduce(name, skipna, **kwargs)

def _maybe_mask_result(self, result, mask, other, op_name: str):
"""
Expand Down
40 changes: 3 additions & 37 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,13 @@
is_integer_dtype,
is_list_like,
is_object_dtype,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.missing import isna

from pandas.core import nanops, ops
from pandas.core import ops
from pandas.core.array_algos import masked_reductions
from pandas.core.indexers import check_array_indexer
from pandas.core.ops import invalid_comparison
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.tools.numeric import to_numeric
Expand Down Expand Up @@ -417,19 +415,8 @@ def reconstruct(x):
else:
return reconstruct(result)

def __setitem__(self, key, value) -> None:
_is_scalar = is_scalar(value)
if _is_scalar:
value = [value]
value, mask = coerce_to_array(value, dtype=self.dtype)

if _is_scalar:
value = value[0]
mask = mask[0]

key = check_array_indexer(self, key)
self._data[key] = value
self._mask[key] = mask
def _coerce_to_array(self, value) -> Tuple[np.ndarray, np.ndarray]:
return coerce_to_array(value, dtype=self.dtype)

def astype(self, dtype, copy: bool = True) -> ArrayLike:
"""
Expand Down Expand Up @@ -551,27 +538,6 @@ def cmp_method(self, other):
name = f"__{op.__name__}__"
return set_function_name(cmp_method, name, cls)

def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

# coerce to a nan-aware float if needed
# (we explicitly use NaN within reductions)
if self._hasna:
data = self.to_numpy("float64", na_value=np.nan)

op = getattr(nanops, "nan" + name)
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

if np.isnan(result):
return libmissing.NA

return result

def sum(self, skipna=True, min_count=0, **kwargs):
nv.validate_sum((), kwargs)
result = masked_reductions.sum(
Expand Down
47 changes: 46 additions & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,17 @@
from pandas.util._decorators import doc

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype
from pandas.core.dtypes.common import (
is_integer,
is_object_dtype,
is_scalar,
is_string_dtype,
)
from pandas.core.dtypes.missing import isna, notna

from pandas.core import nanops
from pandas.core.algorithms import _factorize_array, take
from pandas.core.array_algos import masked_reductions
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.indexers import check_array_indexer

Expand Down Expand Up @@ -77,6 +84,23 @@ def __getitem__(self, item):

return type(self)(self._data[item], self._mask[item])

def _coerce_to_array(self, values) -> Tuple[np.ndarray, np.ndarray]:
raise AbstractMethodError(self)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

orthogranal to this PR but wondering if we even need this any more or could just use abc.abstractmethod decorator

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We generally use this instead of the decorator in the EA base class, because the decorator requires it to actually be an abstract base class.

See

This class does not inherit from 'abc.ABCMeta' for performance reasons.
Methods and properties required by the interface raise
``pandas.errors.AbstractMethodError`` and no ``register`` method is
provided for registering virtual subclasses.


def __setitem__(self, key, value) -> None:
_is_scalar = is_scalar(value)
if _is_scalar:
value = [value]
value, mask = self._coerce_to_array(value)

if _is_scalar:
value = value[0]
mask = mask[0]

key = check_array_indexer(self, key)
self._data[key] = value
self._mask[key] = mask

def __iter__(self):
for i in range(len(self)):
if self._mask[i]:
Expand Down Expand Up @@ -305,3 +329,24 @@ def value_counts(self, dropna: bool = True) -> "Series":
counts = IntegerArray(counts, mask)

return Series(counts, index=index)

def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

# coerce to a nan-aware float if needed
# (we explicitly use NaN within reductions)
if self._hasna:
data = self.to_numpy("float64", na_value=np.nan)

op = getattr(nanops, "nan" + name)
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

if np.isnan(result):
return libmissing.NA
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as a follow up, should this be self.dtype.na_value instead of hardcoded in the base class.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I personally like having it explicit as well (all subclasses use NA as missing value indicator, this method is not shared with ones where dtype.na_value is something different), but can go either way

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm,

we have

    @property
    def _na_value(self):
        return self.dtype.na_value

so this is not consistent.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could also remove that property, from a quick search, that is only used once in the arrays code

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, if not generally on an array class, I think self.dtype.na_value is generally clearer.

This is not the only case where libmissing.NA is used, so we should use one ore the other. I accept your argument about being explicit but also prefer the the case of allowing the class to be extensible. YAGNI favours your proposal.


return result