Skip to content

EA: BoolArray #25415

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 25 additions & 4 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,33 @@ def all_arithmetic_operators(request):
return request.param


_all_numeric_reductions = ['sum', 'max', 'min',
'mean', 'prod', 'std', 'var', 'median',
'kurt', 'skew']
# reductions that are generally applicable to all data types
_non_numeric_reductions = ['min', 'max', 'sum']

# reductions that are generally applicable to
# only numeric data dtypes
_numeric_reductions = ['mean', 'prod',
'std', 'var', 'median',
'kurt', 'skew']

@pytest.fixture(params=_all_numeric_reductions)

@pytest.fixture(params=_non_numeric_reductions)
def only_non_numeric_reductions(request):
    """
    Parametrized fixture over the names in ``_non_numeric_reductions``.

    Each test using this fixture runs once per reduction name and
    receives that name as the fixture value.
    """
    reduction_name = request.param
    return reduction_name


@pytest.fixture(params=_numeric_reductions)
def only_numeric_reductions(request):
    """
    Parametrized fixture over the names in ``_numeric_reductions``.

    Each test using this fixture runs once per reduction name and
    receives that name as the fixture value.
    """
    reduction_name = request.param
    return reduction_name


@pytest.fixture(params=_non_numeric_reductions + _numeric_reductions)
def all_numeric_reductions(request):
"""
Fixture for numeric reduction names
Expand Down
14 changes: 11 additions & 3 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from pandas.core import nanops
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.arrays.mask import get_mask_array_type
from pandas.core.tools.numeric import to_numeric


Expand Down Expand Up @@ -287,7 +288,7 @@ def __init__(self, values, mask, copy=False):
and is_integer_dtype(values.dtype)):
raise TypeError("values should be integer numpy array. Use "
"the 'integer_array' function instead")
if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
if not is_bool_dtype(mask):
raise TypeError("mask should be boolean numpy array. Use "
"the 'integer_array' function instead")

Expand All @@ -296,7 +297,7 @@ def __init__(self, values, mask, copy=False):
mask = mask.copy()

self._data = values
self._mask = mask
self._mask = get_mask_array_type()._from_sequence(mask, copy=False)

@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
Expand Down Expand Up @@ -332,7 +333,8 @@ def _coerce_to_ndarray(self):

# TODO(jreback) make this better
data = self._data.astype(object)
data[self._mask] = self._na_value
mask = np.array(self._mask, copy=False)
data[mask] = self._na_value
return data

__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
Expand Down Expand Up @@ -407,6 +409,11 @@ def nbytes(self):
def isna(self):
return self._mask

@property
def flags(self):
    """Expose the ``flags`` attribute of the underlying ``_data`` (compat)."""
    values = self._data
    return values.flags

@property
def _na_value(self):
return np.nan
Expand Down Expand Up @@ -559,6 +566,7 @@ def cmp_method(self, other):
else:
mask = self._mask | mask

mask = np.array(mask, copy=False)
result[mask] = op_name == 'ne'
return result

Expand Down
30 changes: 30 additions & 0 deletions pandas/core/arrays/mask/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
_MaskArrayType = None


def get_mask_array_type():
    """Return the mask array class, resolving it on the first call.

    The lookup is deferred until call time (rather than import time)
    because the implementations, e.g. pyarrow, depend on pandas being
    importable. The resolved class is cached in the module-level
    ``_MaskArrayType`` and returned directly on later calls.

    ``ArrowMaskArray`` is used when its module can be imported;
    otherwise ``NumpyMaskArray`` is used.
    """
    global _MaskArrayType

    if _MaskArrayType is None:
        try:
            from pandas.core.arrays.mask._pyarrow import ArrowMaskArray
        except ImportError:
            # arrow-backed implementation unavailable, use the numpy one
            from pandas.core.arrays.mask._numpy import NumpyMaskArray

            _MaskArrayType = NumpyMaskArray
        else:
            _MaskArrayType = ArrowMaskArray

    return _MaskArrayType


__all__ = ['get_mask_array_type']
140 changes: 140 additions & 0 deletions pandas/core/arrays/mask/_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""A boolean mask interface.

This module provides an interface to a numpy / pyarrow boolean mask.
This is limited, as not all of the implementations can hold NA, so
for consistency this module is internal-only.
"""

import copy

import numpy as np

from pandas.api.extensions import ExtensionDtype
from pandas.api.types import is_scalar
from pandas.core.arrays.base import ExtensionArray
from pandas.core.missing import isna


class MaskDtype(ExtensionDtype):
    """Extension dtype describing a boolean mask.

    Instances compare equal to the string ``'bool'``, to other instances
    of the same class and to ``np.dtype('bool')``.
    """

    type = np.bool_
    kind = 'b'
    name = 'bool'

    @classmethod
    def construct_from_string(cls, string):
        """Return an instance when ``string`` equals ``cls.name``.

        Raises
        ------
        TypeError
            If ``string`` is anything other than ``cls.name``.
        """
        if string == cls.name:
            return cls()
        else:
            raise TypeError("Cannot construct a '{}' from "
                            "'{}'".format(cls, string))

    @property
    def _is_boolean(self):
        # The ExtensionDtype base declares ``_is_boolean`` as a property;
        # as a plain method, attribute access would yield a bound method
        # object instead of ``True``.
        return True

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        """Return whether ``other`` denotes the same boolean dtype."""
        # compare == to np.dtype('bool')
        if isinstance(other, str):
            return other == self.name
        elif isinstance(other, type(self)):
            return True
        elif isinstance(other, np.dtype):
            return other == 'bool'
        else:
            try:
                return hash(self) == hash(other)
            except TypeError:
                # unhashable operands (list, dict, ndarray, ...) cannot
                # match via the hash fallback; they are simply unequal
                return False


class MaskArray(ExtensionArray):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More of a question than anything else but is there a reason for defining a lot of the methods in this base class rather than in subclasses? Not terribly familiar with pyarrow yet but would the goal not be to decouple that from numpy here in the long run?

"""Common baseclass for both pyarrow and numpy masked arrays"""
_typ = "maskarray"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this make isinstance(BoolArray(), ABCExtensionArray) false?


@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
return cls.from_scalars(scalars)

@property
def size(self):
return len(self)

def __eq__(self, other):
return np.array(self, copy=False) == np.array(other, copy=False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May be good to keep a list of things requiring a cast to NumPy at the top of this file.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And perhaps make JIRAs for them as well.

This, isnull(), unary / binops, reductions.


def __len__(self):
return len(self._data)

def isna(self):
nas = isna(np.array(self._data, copy=False))
return type(self).from_scalars(nas)

def __invert__(self):
return type(self).from_scalars(
~np.array(self._data, copy=False)
)

def __or__(self, other):
return type(self).from_scalars(np.array(
self, copy=False).__or__(np.array(other, copy=False)))

def __ior__(self, other):
return type(self).from_scalars(
np.array(self, copy=False) | np.array(other, copy=False))

def __and__(self, other):
return type(self).from_scalars(
np.array(self, copy=False).__and__(np.array(other, copy=False)))

def __iand__(self, other):
return type(self).from_scalars(
np.array(self, copy=False) & (np.array(other, copy=False)))

def __getitem__(self, item):
arr = np.array(self, copy=False)
if is_scalar(item):
return arr[item]
else:
arr = arr[item]
return type(self).from_scalars(arr)

def view(self, dtype=None):
arr = np.array(self._data, copy=False)
if dtype is not None:
arr = arr.view(dtype=dtype)
return arr

def sum(self, axis=None, min_count=None):
return np.array(self, copy=False).sum()

def copy(self, deep=False):
if deep:
return type(self)(copy.deepcopy(self._data))
else:
return type(self)(copy.copy(self._data))

def any(self, axis=0, out=None):
return np.array(self._data, copy=False).any()

def all(self, axis=0, out=None):
return np.array(self._data, copy=False).all()

def min(self, axis=0, out=None):
return np.array(self._data, copy=False).min()

def max(self, axis=0, out=None):
return np.array(self._data, copy=False).max()

def _reduce(self, method, skipna=True, **kwargs):
if skipna:
arr = self[~self.isna()]
else:
arr = self
# we only allow explicity defined methods
# ndarrays actually support: mean, var, prod, min, max
try:
op = getattr(arr, method)
return op()
except AttributeError:
pass
raise TypeError
82 changes: 82 additions & 0 deletions pandas/core/arrays/mask/_numpy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""
This module provides a NumPy-backed boolean array
"""

import numpy as np

from pandas.api.extensions import take
from pandas.core.arrays.mask._base import MaskArray, MaskDtype


class NumpyMaskDtype(MaskDtype):
    """Dtype paired with ``NumpyMaskArray``; its ``na_value`` is ``np.nan``."""

    na_value = np.nan

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        array_type = NumpyMaskArray
        return array_type


class NumpyMaskArray(MaskArray):
    """Boolean mask array stored in a NumPy ``bool_`` ndarray.

    Generic class which can be used to represent missing data. The mask
    itself is held in ``self._data``.
    """

    dtype = NumpyMaskDtype()

    @classmethod
    def from_scalars(cls, values):
        """Build an instance from ``values`` coerced to a ``bool_`` ndarray.

        The coerced array is handed to the constructor with
        ``copy=False``.
        """
        arr = np.asarray(values).astype(np.bool_, copy=False)
        return cls(arr, copy=False)

    def __init__(self, mask, copy=True):
        """
        Parameters
        ----------
        mask : numpy array
            Mask of missing values. Must be an ndarray of dtype ``bool_``.
        copy : bool, default True
            Whether to store a copy of ``mask`` rather than ``mask``
            itself.
        """
        assert isinstance(mask, np.ndarray)
        assert mask.dtype == np.bool_

        if copy:
            mask = mask.copy()
        self._data = mask

    def __setitem__(self, key, value):
        """Assign ``value`` at ``key`` directly in the underlying ndarray."""
        self._data[key] = value

    def __array__(self, dtype=None):
        """Return the underlying ndarray; ``dtype`` is not used here."""
        return self._data

    def __iter__(self):
        """Iterate over the elements of the underlying ndarray."""
        return iter(self._data)

    @property
    def nbytes(self):
        """Bytes consumed by the underlying ndarray (``ndarray.nbytes``)."""
        return self._data.nbytes

    def reshape(self, shape, **kwargs):
        """Return the data as a NumPy array reshaped to ``shape``.

        The result is a plain ndarray, not a ``NumpyMaskArray``.
        """
        return np.asarray(self).reshape(shape, **kwargs)

    def astype(self, dtype, copy=True):
        """Cast to ``dtype``.

        When ``dtype`` is this array's own dtype, return ``self`` (or a
        copy of it when ``copy`` is true); otherwise defer to the parent
        implementation.
        """
        # needed to fix this astype for the Series constructor.
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        return super(NumpyMaskArray, self).astype(dtype, copy)

    def take(self, indices, allow_fill=False, fill_value=None, axis=None):
        """Take elements at ``indices`` and return them as a new instance.

        When ``allow_fill`` is true and no ``fill_value`` is given, the
        dtype's ``na_value`` is used as the fill value. ``axis`` is
        accepted but not used here.
        """
        # TODO: had to add axis here
        data = self._data

        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(data, indices, fill_value=fill_value,
                      allow_fill=allow_fill)
        return self._from_sequence(result, dtype=self.dtype)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate ``to_concat`` into a single new instance.

        Declared as a classmethod because the ExtensionArray interface
        invokes it on the class; without the decorator ``cls`` would be
        bound to the wrong argument.
        """
        concat = np.concatenate(to_concat)
        return cls.from_scalars(concat)
Loading