Skip to content

REF: Implement BaseMaskedArray class for integer/boolean ExtensionArrays #30789

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 6 additions & 183 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,19 @@
is_extension_array_dtype,
is_float,
is_float_dtype,
is_integer,
is_integer_dtype,
is_list_like,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, notna

from pandas.core import nanops, ops
from pandas.core.algorithms import take
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer

from .masked import BaseMaskedArray
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use absolute imports?


if TYPE_CHECKING:
from pandas._typing import Scalar
Expand Down Expand Up @@ -199,7 +194,7 @@ def coerce_to_array(values, mask=None, copy: bool = False):
return values, mask


class BooleanArray(ExtensionArray, ExtensionOpsMixin):
class BooleanArray(BaseMaskedArray):
"""
Array of boolean (True/False) data with missing values.

Expand Down Expand Up @@ -253,6 +248,9 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
Length: 3, dtype: boolean
"""

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = False

def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
raise TypeError(
Expand Down Expand Up @@ -297,127 +295,6 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
def _from_factorized(cls, values, original: "BooleanArray"):
return cls._from_sequence(values, dtype=original.dtype)

def _formatter(self, boxed=False):
    """
    Return the callable used to render a single element as text.

    Parameters
    ----------
    boxed : bool, default False
        Accepted for interface compatibility; it is not consulted here.

    Returns
    -------
    callable
        The builtin ``str``.
    """
    element_formatter = str
    return element_formatter

@property
def _hasna(self) -> bool:
    """
    Whether any entry of the mask is set, i.e. any element is missing.

    Notes
    -----
    This is expensive right now: the mask is reduced on every access.
    The hope is that this can be made faster by having an optional mask,
    without having to change the source code that uses this property.
    """
    missing_flags = self._mask
    return missing_flags.any()

def __getitem__(self, item):
    """
    Select a single element or a sub-array.

    An integer ``item`` returns one scalar: the dtype's ``na_value`` when
    the mask is set at that position, otherwise the stored value. Any
    other ``item`` indexes both the data and the mask and wraps the
    results in a new instance of the same type.
    """
    # Scalar access: consult the mask before exposing the raw value.
    if is_integer(item):
        return self.dtype.na_value if self._mask[item] else self._data[item]

    # Boolean indexers are validated against this array before being applied.
    if com.is_bool_indexer(item):
        item = check_bool_array_indexer(self, item)

    picked_data = self._data[item]
    picked_mask = self._mask[item]
    return type(self)(picked_data, picked_mask)

def to_numpy(
    self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
):
    """
    Convert to a NumPy Array.

    By default converts to an object-dtype NumPy array. Specify the `dtype` and
    `na_value` keywords to customize the conversion.

    Parameters
    ----------
    dtype : dtype, default object
        The numpy dtype to convert to.
    copy : bool, default False
        Whether to ensure that the returned value is not a view on
        the array. Note that ``copy=False`` does not *ensure* that
        ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
        a copy is made, even if not strictly necessary. Avoiding a copy
        is typically only possible when no missing values are present
        and `dtype` is a boolean dtype.
    na_value : scalar, optional
        Scalar missing value indicator to use in numpy array. Defaults
        to the native missing value indicator of this array (pd.NA).

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    ValueError
        If missing values are present, `dtype` is neither an object nor a
        string dtype, and `na_value` is left at the default (pd.NA).

    Examples
    --------
    An object-dtype is the default result

    >>> a = pd.array([True, False], dtype="boolean")
    >>> a.to_numpy()
    array([True, False], dtype=object)

    When no missing values are present, a boolean dtype can be used.

    >>> a.to_numpy(dtype="bool")
    array([ True, False])

    However, requesting a bool dtype will raise a ValueError if
    missing values are present and the default missing value :attr:`NA`
    is used.

    >>> a = pd.array([True, False, pd.NA], dtype="boolean")
    >>> a
    <BooleanArray>
    [True, False, NA]
    Length: 3, dtype: boolean

    >>> a.to_numpy(dtype="bool")
    Traceback (most recent call last):
    ...
    ValueError: cannot convert to 'bool'-dtype NumPy array with missing values. Specify an appropriate 'na_value' for this dtype.

    Specify a valid `na_value` instead

    >>> a.to_numpy(dtype="bool", na_value=False)
    array([ True, False, False])
    """
    if na_value is lib.no_default:
        na_value = libmissing.NA
    if dtype is None:
        dtype = object
    if self._hasna:
        # Only object and string dtypes are allowed to receive the NA
        # singleton; for any other dtype the caller must pick a substitute.
        if (
            not (is_object_dtype(dtype) or is_string_dtype(dtype))
            and na_value is libmissing.NA
        ):
            raise ValueError(
                f"cannot convert to '{dtype}'-dtype NumPy array "
                "with missing values. Specify an appropriate 'na_value' "
                "for this dtype."
            )
        # don't pass copy to astype -> always need a copy since we are mutating
        data = self._data.astype(dtype)
        data[self._mask] = na_value
    else:
        # Nothing to overwrite, so the caller's copy preference is honoured.
        data = self._data.astype(dtype, copy=copy)
    return data

__array_priority__ = 1000 # higher than ndarray so ops dispatch to us

def __array__(self, dtype=None):
    """
    NumPy array interface: hand back this array's values.

    The call is forwarded to ``to_numpy`` with the requested ``dtype``.
    When no dtype is specified the result is intended to be an object
    array, so that the scalar values are preserved.
    """
    converted = self.to_numpy(dtype=dtype)
    return converted

def __arrow_array__(self, type=None):
    """
    Convert this array into a pyarrow Array.

    The data buffer is passed as the values and the mask buffer as the
    ``mask`` argument; ``type`` is forwarded unchanged.
    """
    # pyarrow is imported only when a conversion is actually requested.
    import pyarrow

    values, missing = self._data, self._mask
    return pyarrow.array(values, mask=missing, type=type)

_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)

def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
Expand Down Expand Up @@ -465,40 +342,6 @@ def reconstruct(x):
else:
return reconstruct(result)

def __iter__(self):
    """
    Iterate over the elements, yielding the dtype's ``na_value`` wherever
    the mask is set and the stored value everywhere else.
    """
    # Walk data and mask in lock-step instead of indexing both buffers
    # once per position.
    for value, is_missing in zip(self._data, self._mask):
        if is_missing:
            yield self.dtype.na_value
        else:
            yield value

def take(self, indexer, allow_fill=False, fill_value=None):
    """
    Take elements from the array by position.

    Both the data and the mask are taken with ``indexer``. When
    ``allow_fill`` is true and ``fill_value`` is not NA, the positions
    where ``indexer`` equals -1 receive ``fill_value`` and are flipped in
    the mask; positions that were already missing are left untouched.

    Returns a new instance of the same type built from the taken data and
    mask.
    """
    # The data buffer is always filled with False internally so that it
    # is never upcast.
    if isna(fill_value):
        data_fill_value = False
    else:
        data_fill_value = fill_value

    taken_data = take(
        self._data, indexer, allow_fill=allow_fill, fill_value=data_fill_value
    )
    taken_mask = take(self._mask, indexer, allow_fill=allow_fill, fill_value=True)

    # When filling, only the slots where the indexer is null are touched,
    # not the existing missing values.
    # TODO(jreback) what if we have a non-na float as a fill value?
    if allow_fill and notna(fill_value):
        filled_positions = np.asarray(indexer) == -1
        taken_data[filled_positions] = fill_value
        taken_mask = taken_mask ^ filled_positions

    return type(self)(taken_data, taken_mask, copy=False)

def copy(self):
    """
    Return a new instance of the same type backed by copies of the data
    and mask buffers.
    """
    # The buffers are duplicated here, so the constructor is told not to
    # copy them again.
    return type(self)(self._data.copy(), self._mask.copy(), copy=False)

def __setitem__(self, key, value):
_is_scalar = is_scalar(value)
if _is_scalar:
Expand All @@ -512,26 +355,6 @@ def __setitem__(self, key, value):
self._data[key] = value
self._mask[key] = mask

def __len__(self):
    """Return the number of elements, i.e. the length of ``_data``."""
    n_elements = len(self._data)
    return n_elements

@property
def nbytes(self):
    """Combined byte size of the ``_data`` and ``_mask`` buffers."""
    data_bytes = self._data.nbytes
    mask_bytes = self._mask.nbytes
    return data_bytes + mask_bytes

def isna(self):
    """
    Return the boolean mask that marks the missing entries.

    The ``_mask`` object itself is returned, not a copy of it.
    """
    missing = self._mask
    return missing

@property
def _na_value(self):
    """The missing-value marker, read from ``self._dtype.na_value``."""
    own_dtype = self._dtype
    return own_dtype.na_value

@classmethod
def _concat_same_type(cls, to_concat):
    """
    Concatenate several arrays of this type into a single one.

    The ``_data`` buffers and the ``_mask`` buffers of the inputs are each
    joined in order, and the results are passed to the class constructor.
    """
    data_parts = [arr._data for arr in to_concat]
    mask_parts = [arr._mask for arr in to_concat]
    return cls(np.concatenate(data_parts), np.concatenate(mask_parts))

def astype(self, dtype, copy=True):
"""
Cast to a NumPy array or ExtensionArray with 'dtype'.
Expand Down
Loading