Skip to content

Commit d030c57

Browse files
committed
ENH: create NumpyBoolArray and ArrowBoolArray subclasses for MaskArray
1 parent 7b4b60f commit d030c57

29 files changed

+767
-402
lines changed

pandas/conftest.py

+25-4
Original file line numberDiff line numberDiff line change
@@ -149,12 +149,33 @@ def all_arithmetic_operators(request):
149149
return request.param
150150

151151

152-
_all_numeric_reductions = ['sum', 'max', 'min',
153-
'mean', 'prod', 'std', 'var', 'median',
154-
'kurt', 'skew']
152+
# reductions that are generally applicable to all data types
153+
_non_numeric_reductions = ['min', 'max', 'sum']
155154

155+
# reductions that are generally applicable to
156+
# only numeric data dtypes
157+
_numeric_reductions = ['mean', 'prod',
158+
'std', 'var', 'median',
159+
'kurt', 'skew']
156160

157-
@pytest.fixture(params=_all_numeric_reductions)
161+
162+
@pytest.fixture(params=_non_numeric_reductions)
163+
def only_non_numeric_reductions(request):
164+
"""
165+
Fixture for only non-numeric reduction names
166+
"""
167+
return request.param
168+
169+
170+
@pytest.fixture(params=_numeric_reductions)
171+
def only_numeric_reductions(request):
172+
"""
173+
Fixture for only numeric reduction names
174+
"""
175+
return request.param
176+
177+
178+
@pytest.fixture(params=_non_numeric_reductions + _numeric_reductions)
158179
def all_numeric_reductions(request):
159180
"""
160181
Fixture for numeric reduction names

pandas/core/arrays/_mask.py

-103
This file was deleted.

pandas/core/arrays/integer.py

+11-23
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,8 @@
1919

2020
from pandas.core import nanops
2121
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
22-
<<<<<<< HEAD
22+
from pandas.core.arrays.mask import get_mask_array_type
2323
from pandas.core.tools.numeric import to_numeric
24-
=======
25-
from pandas.core.arrays._mask import NAMask
26-
>>>>>>> 384287e71... Revert unnecessary changes from master
2724

2825

2926
class _IntegerDtype(ExtensionDtype):
@@ -291,8 +288,7 @@ def __init__(self, values, mask, copy=False):
291288
and is_integer_dtype(values.dtype)):
292289
raise TypeError("values should be integer numpy array. Use "
293290
"the 'integer_array' function instead")
294-
if not (isinstance(mask, NAMask) or (
295-
isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype))):
291+
if not is_bool_dtype(mask):
296292
raise TypeError("mask should be boolean numpy array. Use "
297293
"the 'integer_array' function instead")
298294

@@ -301,7 +297,7 @@ def __init__(self, values, mask, copy=False):
301297
mask = mask.copy()
302298

303299
self._data = values
304-
self._mask = NAMask(mask)
300+
self._mask = get_mask_array_type()._from_sequence(mask, copy=False)
305301

306302
@classmethod
307303
def _from_sequence(cls, scalars, dtype=None, copy=False):
@@ -328,22 +324,7 @@ def __getitem__(self, item):
328324
if self._mask[item]:
329325
return self.dtype.na_value
330326
return self._data[item]
331-
<<<<<<< HEAD
332327
return type(self)(self._data[item], self._mask[item])
333-
=======
334-
335-
<<<<<<< HEAD
336-
return type(self)(self._data[item],
337-
<<<<<<< HEAD
338-
mask=_bitarray_to_numpy(self._mask)[item],
339-
dtype=self.dtype)
340-
>>>>>>> 2ff4b0907... First pass at implementation (needs refactor)
341-
=======
342-
mask=_bitarray_to_numpy(self._mask)[item])
343-
>>>>>>> e085674ac... Reverted changes; created new module for mask
344-
=======
345-
return type(self)(self._data[item], self._mask[item])
346-
>>>>>>> 384287e71... Revert unnecessary changes from master
347328

348329
def _coerce_to_ndarray(self):
349330
"""
@@ -352,7 +333,8 @@ def _coerce_to_ndarray(self):
352333

353334
# TODO(jreback) make this better
354335
data = self._data.astype(object)
355-
data[self._mask] = self._na_value
336+
mask = np.array(self._mask, copy=False)
337+
data[mask] = self._na_value
356338
return data
357339

358340
__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
@@ -427,6 +409,11 @@ def nbytes(self):
427409
def isna(self):
428410
return self._mask
429411

412+
@property
413+
def flags(self):
414+
# compat
415+
return self._data.flags
416+
430417
@property
431418
def _na_value(self):
432419
return np.nan
@@ -579,6 +566,7 @@ def cmp_method(self, other):
579566
else:
580567
mask = self._mask | mask
581568

569+
mask = np.array(mask, copy=False)
582570
result[mask] = op_name == 'ne'
583571
return result
584572

pandas/core/arrays/mask/__init__.py

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
_MaskArrayType = None
2+
3+
4+
def get_mask_array_type():
5+
"""Return the mask array type to use; we need to do
6+
this after all modules are imported as the implementations
7+
e.g. pyarrow depend on pandas being importable
8+
"""
9+
global _MaskArrayType
10+
11+
if _MaskArrayType is not None:
12+
return _MaskArrayType
13+
14+
# if ArrowMaskArray is available use it
15+
# otherwise use the NumpyMaskArray
16+
try:
17+
from pandas.core.arrays.mask._pyarrow import ArrowMaskArray
18+
19+
MaskArray = ArrowMaskArray
20+
21+
except ImportError:
22+
from pandas.core.arrays.mask._numpy import NumpyMaskArray
23+
24+
MaskArray = NumpyMaskArray
25+
26+
_MaskArrayType = MaskArray
27+
return _MaskArrayType
28+
29+
30+
__all__ = ['get_mask_array_type']

pandas/core/arrays/mask/_base.py

+140
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""A boolean mask interface.
2+
3+
This module provides an interface to a numpy / pyarrow boolean mask.
4+
This is limited as not all of the implementations can hold NA, so
5+
for consistency this is an internal module.
6+
"""
7+
8+
import copy
9+
10+
import numpy as np
11+
12+
from pandas.api.extensions import ExtensionDtype
13+
from pandas.api.types import is_scalar
14+
from pandas.core.arrays.base import ExtensionArray
15+
from pandas.core.missing import isna
16+
17+
18+
class MaskDtype(ExtensionDtype):
19+
20+
type = np.bool_
21+
kind = 'b'
22+
name = 'bool'
23+
24+
@classmethod
25+
def construct_from_string(cls, string):
26+
if string == cls.name:
27+
return cls()
28+
else:
29+
raise TypeError("Cannot construct a '{}' from "
30+
"'{}'".format(cls, string))
31+
32+
def _is_boolean(self):
33+
return True
34+
35+
def __hash__(self):
36+
return hash(str(self))
37+
38+
def __eq__(self, other):
39+
# compare == to np.dtype('bool')
40+
if isinstance(other, str):
41+
return other == self.name
42+
elif isinstance(other, type(self)):
43+
return True
44+
elif isinstance(other, np.dtype):
45+
return other == 'bool'
46+
else:
47+
return hash(self) == hash(other)
48+
49+
50+
class MaskArray(ExtensionArray):
51+
"""Common base class for both pyarrow and numpy mask arrays"""
52+
_typ = "maskarray"
53+
54+
@classmethod
55+
def _from_sequence(cls, scalars, dtype=None, copy=False):
56+
return cls.from_scalars(scalars)
57+
58+
@property
59+
def size(self):
60+
return len(self)
61+
62+
def __eq__(self, other):
63+
return np.array(self, copy=False) == np.array(other, copy=False)
64+
65+
def __len__(self):
66+
return len(self._data)
67+
68+
def isna(self):
69+
nas = isna(np.array(self._data, copy=False))
70+
return type(self).from_scalars(nas)
71+
72+
def __invert__(self):
73+
return type(self).from_scalars(
74+
~np.array(self._data, copy=False)
75+
)
76+
77+
def __or__(self, other):
78+
return type(self).from_scalars(np.array(
79+
self, copy=False).__or__(np.array(other, copy=False)))
80+
81+
def __ior__(self, other):
82+
return type(self).from_scalars(
83+
np.array(self, copy=False) | np.array(other, copy=False))
84+
85+
def __and__(self, other):
86+
return type(self).from_scalars(
87+
np.array(self, copy=False).__and__(np.array(other, copy=False)))
88+
89+
def __iand__(self, other):
90+
return type(self).from_scalars(
91+
np.array(self, copy=False) & (np.array(other, copy=False)))
92+
93+
def __getitem__(self, item):
94+
arr = np.array(self, copy=False)
95+
if is_scalar(item):
96+
return arr[item]
97+
else:
98+
arr = arr[item]
99+
return type(self).from_scalars(arr)
100+
101+
def view(self, dtype=None):
102+
arr = np.array(self._data, copy=False)
103+
if dtype is not None:
104+
arr = arr.view(dtype=dtype)
105+
return arr
106+
107+
def sum(self, axis=None, min_count=None):
108+
return np.array(self, copy=False).sum()
109+
110+
def copy(self, deep=False):
111+
if deep:
112+
return type(self)(copy.deepcopy(self._data))
113+
else:
114+
return type(self)(copy.copy(self._data))
115+
116+
def any(self, axis=0, out=None):
117+
return np.array(self._data, copy=False).any()
118+
119+
def all(self, axis=0, out=None):
120+
return np.array(self._data, copy=False).all()
121+
122+
def min(self, axis=0, out=None):
123+
return np.array(self._data, copy=False).min()
124+
125+
def max(self, axis=0, out=None):
126+
return np.array(self._data, copy=False).max()
127+
128+
def _reduce(self, method, skipna=True, **kwargs):
129+
if skipna:
130+
arr = self[~self.isna()]
131+
else:
132+
arr = self
133+
# we only allow explicitly defined methods
134+
# ndarrays actually support: mean, var, prod, min, max
135+
try:
136+
op = getattr(arr, method)
137+
return op()
138+
except AttributeError:
139+
pass
140+
raise TypeError

0 commit comments

Comments
 (0)