pandas-dev · jreback · Aug 7, 2018 · Feb 17, 2019 · WillAyd · Feb 25, 2019
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -149,12 +149,33 @@ def all_arithmetic_operators(request):
     return request.param
 
 
-_all_numeric_reductions = ['sum', 'max', 'min',
-                           'mean', 'prod', 'std', 'var', 'median',
-                           'kurt', 'skew']
+# reductions that are generally applicable to all data types
+_non_numeric_reductions = ['min', 'max', 'sum']
 
+# reductions that are generally applicable to
+# only numeric data dtypes
+_numeric_reductions = ['mean', 'prod',
+                       'std', 'var', 'median',
+                       'kurt', 'skew']
 
-@pytest.fixture(params=_all_numeric_reductions)
+
+@pytest.fixture(params=_non_numeric_reductions)
+def only_non_numeric_reductions(request):
+    """
+    Fixture for reduction names applicable to all dtypes, not only numeric
+    """
+    return request.param
+
+
+@pytest.fixture(params=_numeric_reductions)
+def only_numeric_reductions(request):
+    """
+    Fixture for reduction names applicable to only numeric dtypes
+    """
+    return request.param
+
+
+@pytest.fixture(params=_non_numeric_reductions + _numeric_reductions)
 def all_numeric_reductions(request):
     """
     Fixture for numeric reduction names

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -19,6 +19,7 @@
 
 from pandas.core import nanops
 from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
+from pandas.core.arrays.mask import get_mask_array_type
 from pandas.core.tools.numeric import to_numeric
 
 
@@ -287,7 +288,7 @@ def __init__(self, values, mask, copy=False):
                 and is_integer_dtype(values.dtype)):
             raise TypeError("values should be integer numpy array. Use "
                             "the 'integer_array' function instead")
-        if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
+        if not is_bool_dtype(mask):
             raise TypeError("mask should be boolean numpy array. Use "
                             "the 'integer_array' function instead")
 
@@ -296,7 +297,7 @@ def __init__(self, values, mask, copy=False):
             mask = mask.copy()
 
         self._data = values
-        self._mask = mask
+        self._mask = get_mask_array_type()._from_sequence(mask, copy=False)
 
     @classmethod
     def _from_sequence(cls, scalars, dtype=None, copy=False):
@@ -332,7 +333,8 @@ def _coerce_to_ndarray(self):
 
         # TODO(jreback) make this better
         data = self._data.astype(object)
-        data[self._mask] = self._na_value
+        mask = np.array(self._mask, copy=False)
+        data[mask] = self._na_value
         return data
 
     __array_priority__ = 1000  # higher than ndarray so ops dispatch to us
@@ -407,6 +409,11 @@ def nbytes(self):
     def isna(self):
         return self._mask
 
+    @property
+    def flags(self):
+        # compat: expose the ``flags`` attribute of the underlying data
+        return self._data.flags
+
     @property
     def _na_value(self):
         return np.nan
@@ -559,6 +566,7 @@ def cmp_method(self, other):
             else:
                 mask = self._mask | mask
 
+            mask = np.array(mask, copy=False)
             result[mask] = op_name == 'ne'
             return result
 

diff --git a/pandas/core/arrays/mask/__init__.py b/pandas/core/arrays/mask/__init__.py
@@ -0,0 +1,30 @@
+_MaskArrayType = None
+
+
+def get_mask_array_type():
+    """Return the mask array class to use, resolving it on first call.
+
+    ArrowMaskArray is used when it can be imported, otherwise
+    NumpyMaskArray. The lookup is deferred to call time because the
+    implementations, e.g. pyarrow, depend on pandas being importable;
+    the result is cached in the module-level ``_MaskArrayType``.
+    """
+    global _MaskArrayType
+
+    if _MaskArrayType is None:
+        try:
+            from pandas.core.arrays.mask._pyarrow import ArrowMaskArray
+        except ImportError:
+            # the arrow implementation is unavailable, fall back to numpy
+            from pandas.core.arrays.mask._numpy import NumpyMaskArray
+
+            resolved = NumpyMaskArray
+        else:
+            resolved = ArrowMaskArray
+
+        _MaskArrayType = resolved
+
+    return _MaskArrayType
+
+
+__all__ = ['get_mask_array_type']
diff --git a/pandas/core/arrays/mask/_base.py b/pandas/core/arrays/mask/_base.py
@@ -0,0 +1,140 @@
+"""A boolean mask interface.
+
+This module provides an interface to a numpy / pyarrow boolean mask.
+This is limited as not all of the implementations can hold NA, so
+for consistency this module is kept internal.
+"""
+
+import copy
+
+import numpy as np
+
+from pandas.api.extensions import ExtensionDtype
+from pandas.api.types import is_scalar
+from pandas.core.arrays.base import ExtensionArray
+from pandas.core.missing import isna
+
+
+class MaskDtype(ExtensionDtype):
+    """Boolean dtype for mask arrays; compares equal to numpy's bool."""
+    type = np.bool_
+    kind = 'b'
+    name = 'bool'
+
+    @classmethod
+    def construct_from_string(cls, string):
+        """Return an instance for this dtype's name, else raise TypeError."""
+        if string == cls.name:
+            return cls()
+        raise TypeError("Cannot construct a '{}' from "
+                        "'{}'".format(cls, string))
+
+    def _is_boolean(self):
+        return True
+
+    def __hash__(self):
+        return hash(str(self))
+
+    def __eq__(self, other):
+        # compare == to np.dtype('bool')
+        if isinstance(other, str):
+            return other == self.name
+        elif isinstance(other, np.dtype):
+            return other == 'bool'
+        try:
+            return isinstance(other, type(self)) or hash(self) == hash(other)
+        except TypeError:  # an unhashable ``other`` is never equal
+            return False
+
+
+class MaskArray(ExtensionArray):
+    """Common baseclass for both pyarrow and numpy masked arrays"""
+    _typ = "maskarray"
+
+    @classmethod
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
+        """Build from ``scalars``; ``dtype`` and ``copy`` are ignored."""
+        return cls.from_scalars(scalars)
+
+    @property
+    def size(self):
+        return len(self)
+
+    def __eq__(self, other):
+        return np.asarray(self) == np.asarray(other)
+
+    def __len__(self):
+        return len(self._data)
+
+    def isna(self):
+        """Return a new array of this type flagging missing entries."""
+        nas = isna(np.asarray(self._data))
+        return type(self).from_scalars(nas)
+
+    def __invert__(self):
+        """Return the element-wise negation as a new array of this type."""
+        return type(self).from_scalars(~np.asarray(self._data))
+
+    def __or__(self, other):
+        return type(self).from_scalars(np.asarray(self) | np.asarray(other))
+
+    def __ior__(self, other):
+        return type(self).from_scalars(np.asarray(self) | np.asarray(other))
+
+    def __and__(self, other):
+        return type(self).from_scalars(np.asarray(self) & np.asarray(other))
+
+    def __iand__(self, other):
+        return type(self).from_scalars(np.asarray(self) & np.asarray(other))
+
+    def __getitem__(self, item):
+        """Return the element for a scalar key, else a new array."""
+        result = np.asarray(self)[item]
+        if is_scalar(item):
+            return result
+        return type(self).from_scalars(result)
+
+    def view(self, dtype=None):
+        """Return the data as an ndarray, viewed as ``dtype`` if given."""
+        arr = np.asarray(self._data)
+        if dtype is not None:
+            arr = arr.view(dtype=dtype)
+        return arr
+
+    def sum(self, axis=None, min_count=None):
+        """Return the sum; ``axis`` and ``min_count`` are ignored."""
+        return np.asarray(self).sum()
+
+    def copy(self, deep=False):
+        """Return a new array holding a shallow or deep copy of the data."""
+        duplicate = copy.deepcopy if deep else copy.copy
+        return type(self)(duplicate(self._data))
+
+    def any(self, axis=0, out=None):
+        return np.asarray(self._data).any()
+
+    def all(self, axis=0, out=None):
+        return np.asarray(self._data).all()
+
+    def min(self, axis=0, out=None):
+        return np.asarray(self._data).min()
+
+    def max(self, axis=0, out=None):
+        return np.asarray(self._data).max()
+
+    def _reduce(self, method, skipna=True, **kwargs):
+        """Reduce by calling the method named ``method`` on this array.
+
+        With ``skipna`` the entries flagged by ``isna`` are dropped first;
+        ``kwargs`` are accepted for compatibility but not forwarded.
+
+        Raises TypeError if no method called ``method`` is defined.
+        """
+        arr = self[~self.isna()] if skipna else self
+        # we only allow explicitly defined methods
+        # ndarrays actually support: mean, var, prod, min, max
+        op = getattr(arr, method, None)
+        if op is None:
+            raise TypeError("cannot perform '{}' on a '{}'".format(
+                method, type(self).__name__))
+        return op()
diff --git a/pandas/core/arrays/mask/_numpy.py b/pandas/core/arrays/mask/_numpy.py
@@ -0,0 +1,82 @@
+"""
+This module provides a numpy-backed boolean array
+"""
+
+import numpy as np
+
+from pandas.api.extensions import take
+from pandas.core.arrays.mask._base import MaskArray, MaskDtype
+
+
+class NumpyMaskDtype(MaskDtype):
+    """MaskDtype whose array type is NumpyMaskArray."""
+    na_value = np.nan
+
+    @classmethod
+    def construct_array_type(cls):
+        return NumpyMaskArray
+
+
+class NumpyMaskArray(MaskArray):
+    """Mask of missing values stored in a boolean numpy ndarray."""
+
+    dtype = NumpyMaskDtype()
+
+    @classmethod
+    def from_scalars(cls, values):
+        """Build a mask from ``values`` cast to ``np.bool_``."""
+        arr = np.asarray(values).astype(np.bool_, copy=False)
+        return cls(arr, copy=False)
+
+    def __init__(self, mask, copy=True):
+        """
+        Parameters
+        ----------
+        mask : numpy array
+            Mask of missing values.
+        copy : bool, default True
+            Whether to store a copy of ``mask`` rather than ``mask`` itself.
+        """
+        assert isinstance(mask, np.ndarray)
+        assert mask.dtype == np.bool_
+
+        if copy:
+            mask = mask.copy()
+        self._data = mask
+
+    def __setitem__(self, key, value):
+        self._data[key] = value
+
+    def __array__(self, dtype=None):
+        """Return the underlying ndarray; ``dtype`` is ignored."""
+        return self._data
+
+    def __iter__(self):
+        return iter(self._data)
+
+    @property
+    def nbytes(self):
+        return self._data.nbytes
+
+    def reshape(self, shape, **kwargs):
+        return np.asarray(self).reshape(shape, **kwargs)
+
+    def astype(self, dtype, copy=True):
+        # needed to fix this astype for the Series constructor.
+        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
+            return self.copy() if copy else self
+        return super(NumpyMaskArray, self).astype(dtype, copy)
+
+    def take(self, indices, allow_fill=False, fill_value=None, axis=None):
+        # TODO: had to add axis here
+        if allow_fill and fill_value is None:
+            fill_value = self.dtype.na_value
+
+        result = take(self._data, indices, fill_value=fill_value,
+                      allow_fill=allow_fill)
+        return self._from_sequence(result, dtype=self.dtype)
+
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+        """Concatenate the arrays in ``to_concat`` into one of this type."""
+        return cls.from_scalars(np.concatenate(to_concat))