diff --git a/pandas/conftest.py b/pandas/conftest.py index 4088697fa6f5f..ad0f15117b78b 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -149,12 +149,33 @@ def all_arithmetic_operators(request): return request.param -_all_numeric_reductions = ['sum', 'max', 'min', - 'mean', 'prod', 'std', 'var', 'median', - 'kurt', 'skew'] +# reductions that are generally applicable to all data types +_non_numeric_reductions = ['min', 'max', 'sum'] +# reductions that are generally application to +# only numeric data dtypes +_numeric_reductions = ['mean', 'prod', + 'std', 'var', 'median', + 'kurt', 'skew'] -@pytest.fixture(params=_all_numeric_reductions) + +@pytest.fixture(params=_non_numeric_reductions) +def only_non_numeric_reductions(request): + """ + Fixture for only non numeric reduction names + """ + return request.param + + +@pytest.fixture(params=_numeric_reductions) +def only_numeric_reductions(request): + """ + Fixture for only numeric reduction names + """ + return request.param + + +@pytest.fixture(params=_non_numeric_reductions + _numeric_reductions) def all_numeric_reductions(request): """ Fixture for numeric reduction names diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 29c146cb55a23..9ad65687712ce 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -19,6 +19,7 @@ from pandas.core import nanops from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.arrays.mask import get_mask_array_type from pandas.core.tools.numeric import to_numeric @@ -287,7 +288,7 @@ def __init__(self, values, mask, copy=False): and is_integer_dtype(values.dtype)): raise TypeError("values should be integer numpy array. Use " "the 'integer_array' function instead") - if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)): + if not is_bool_dtype(mask): raise TypeError("mask should be boolean numpy array. 
Use " "the 'integer_array' function instead") @@ -296,7 +297,7 @@ def __init__(self, values, mask, copy=False): mask = mask.copy() self._data = values - self._mask = mask + self._mask = get_mask_array_type()._from_sequence(mask, copy=False) @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): @@ -332,7 +333,8 @@ def _coerce_to_ndarray(self): # TODO(jreback) make this better data = self._data.astype(object) - data[self._mask] = self._na_value + mask = np.array(self._mask, copy=False) + data[mask] = self._na_value return data __array_priority__ = 1000 # higher than ndarray so ops dispatch to us @@ -407,6 +409,11 @@ def nbytes(self): def isna(self): return self._mask + @property + def flags(self): + # compat + return self._data.flags + @property def _na_value(self): return np.nan @@ -559,6 +566,7 @@ def cmp_method(self, other): else: mask = self._mask | mask + mask = np.array(mask, copy=False) result[mask] = op_name == 'ne' return result diff --git a/pandas/core/arrays/mask/__init__.py b/pandas/core/arrays/mask/__init__.py new file mode 100644 index 0000000000000..a417679944bc6 --- /dev/null +++ b/pandas/core/arrays/mask/__init__.py @@ -0,0 +1,30 @@ +_MaskArrayType = None + + +def get_mask_array_type(): + """Set the mask array type to use, we need to do + this after all modules are imported as the implementations + e.g. 
pyarrow depend on pandas being importable + """ + global _MaskArrayType + + if _MaskArrayType is not None: + return _MaskArrayType + + # if ArrowBoolArray is available use it + # otherwise use the NumpyMask + try: + from pandas.core.arrays.mask._pyarrow import ArrowMaskArray + + MaskArray = ArrowMaskArray + + except ImportError: + from pandas.core.arrays.mask._numpy import NumpyMaskArray + + MaskArray = NumpyMaskArray + + _MaskArrayType = MaskArray + return _MaskArrayType + + +__all__ = ['get_mask_array_type'] diff --git a/pandas/core/arrays/mask/_base.py b/pandas/core/arrays/mask/_base.py new file mode 100644 index 0000000000000..0bb72d2ab951e --- /dev/null +++ b/pandas/core/arrays/mask/_base.py @@ -0,0 +1,140 @@ +"""A boolean mask interface. + +This module provides an interface to a numpy / pyarrow boolean mask. +This is limited as not all of the implementations can hold NA, so +for consistency this is an internal. +""" + +import copy + +import numpy as np + +from pandas.api.extensions import ExtensionDtype +from pandas.api.types import is_scalar +from pandas.core.arrays.base import ExtensionArray +from pandas.core.missing import isna + + +class MaskDtype(ExtensionDtype): + + type = np.bool_ + kind = 'b' + name = 'bool' + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string)) + + def _is_boolean(self): + return True + + def __hash__(self): + return hash(str(self)) + + def __eq__(self, other): + # compare == to np.dtype('bool') + if isinstance(other, str): + return other == self.name + elif isinstance(other, type(self)): + return True + elif isinstance(other, np.dtype): + return other == 'bool' + else: + return hash(self) == hash(other) + + +class MaskArray(ExtensionArray): + """Common baseclass for both pyarrow and numpy masked arrays""" + _typ = "maskarray" + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + 
return cls.from_scalars(scalars) + + @property + def size(self): + return len(self) + + def __eq__(self, other): + return np.array(self, copy=False) == np.array(other, copy=False) + + def __len__(self): + return len(self._data) + + def isna(self): + nas = isna(np.array(self._data, copy=False)) + return type(self).from_scalars(nas) + + def __invert__(self): + return type(self).from_scalars( + ~np.array(self._data, copy=False) + ) + + def __or__(self, other): + return type(self).from_scalars(np.array( + self, copy=False).__or__(np.array(other, copy=False))) + + def __ior__(self, other): + return type(self).from_scalars( + np.array(self, copy=False) | np.array(other, copy=False)) + + def __and__(self, other): + return type(self).from_scalars( + np.array(self, copy=False).__and__(np.array(other, copy=False))) + + def __iand__(self, other): + return type(self).from_scalars( + np.array(self, copy=False) & (np.array(other, copy=False))) + + def __getitem__(self, item): + arr = np.array(self, copy=False) + if is_scalar(item): + return arr[item] + else: + arr = arr[item] + return type(self).from_scalars(arr) + + def view(self, dtype=None): + arr = np.array(self._data, copy=False) + if dtype is not None: + arr = arr.view(dtype=dtype) + return arr + + def sum(self, axis=None, min_count=None): + return np.array(self, copy=False).sum() + + def copy(self, deep=False): + if deep: + return type(self)(copy.deepcopy(self._data)) + else: + return type(self)(copy.copy(self._data)) + + def any(self, axis=0, out=None): + return np.array(self._data, copy=False).any() + + def all(self, axis=0, out=None): + return np.array(self._data, copy=False).all() + + def min(self, axis=0, out=None): + return np.array(self._data, copy=False).min() + + def max(self, axis=0, out=None): + return np.array(self._data, copy=False).max() + + def _reduce(self, method, skipna=True, **kwargs): + if skipna: + arr = self[~self.isna()] + else: + arr = self + # we only allow explicity defined methods + # ndarrays 
actually support: mean, var, prod, min, max + try: + op = getattr(arr, method) + return op() + except AttributeError: + pass + raise TypeError diff --git a/pandas/core/arrays/mask/_numpy.py b/pandas/core/arrays/mask/_numpy.py new file mode 100644 index 0000000000000..e59f1f050ee5d --- /dev/null +++ b/pandas/core/arrays/mask/_numpy.py @@ -0,0 +1,82 @@ +""" +This module provide a numpy-boolean boolean array +""" + +import numpy as np + +from pandas.api.extensions import take +from pandas.core.arrays.mask._base import MaskArray, MaskDtype + + +class NumpyMaskDtype(MaskDtype): + + na_value = np.nan + + @classmethod + def construct_array_type(cls): + return NumpyMaskArray + + +class NumpyMaskArray(MaskArray): + """Generic class which can be used to represent missing data. + """ + + dtype = NumpyMaskDtype() + + @classmethod + def from_scalars(cls, values): + arr = np.asarray(values).astype(np.bool_, copy=False) + return cls(arr, copy=False) + + def __init__(self, mask, copy=True): + """ + Parameters + ---------- + mask : numpy array + Mask of missing values. + """ + assert isinstance(mask, np.ndarray) + assert mask.dtype == np.bool_ + + if copy: + mask = mask.copy() + self._data = mask + + def __setitem__(self, key, value): + self._data[key] = value + + def __array__(self, dtype=None): + return self._data + + def __iter__(self): + return iter(self._data) + + @property + def nbytes(self): + return self._data.nbytes + + def reshape(self, shape, **kwargs): + return np.array(self, copy=False).reshape(shape, **kwargs) + + def astype(self, dtype, copy=True): + # needed to fix this astype for the Series constructor. 
+ if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: + if copy: + return self.copy() + return self + return super(NumpyMaskArray, self).astype(dtype, copy) + + def take(self, indices, allow_fill=False, fill_value=None, axis=None): + # TODO: had to add axis here + data = self._data + + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indices, fill_value=fill_value, + allow_fill=allow_fill) + return self._from_sequence(result, dtype=self.dtype) + + def _concat_same_type(cls, to_concat): + concat = np.concatenate(to_concat) + return cls.from_scalars(concat) diff --git a/pandas/core/arrays/mask/_pyarrow.py b/pandas/core/arrays/mask/_pyarrow.py new file mode 100644 index 0000000000000..c9b53d56108c8 --- /dev/null +++ b/pandas/core/arrays/mask/_pyarrow.py @@ -0,0 +1,96 @@ +"""Rudimentary Apache Arrow-backed ExtensionArray. + +At the moment, just a boolean array / type is implemented. +Eventually, we'll want to parametrize the type and support +multiple dtypes. Not all methods are implemented yet, and the +current implementation is not efficient. 
+""" +from distutils.version import LooseVersion +import itertools + +import numpy as np + +from pandas.api.extensions import take +from pandas.core.arrays.mask._base import MaskArray, MaskDtype + +# we require pyarrow >= 0.10.0 + +try: + import pyarrow as pa + if pa.__version__ < LooseVersion('0.10.0'): + raise ImportError("pyarrow minimum for bool suppport is 0.10.0") +except ImportError: + raise + + +class ArrowMaskDtype(MaskDtype): + + na_value = pa.NULL + + @classmethod + def construct_array_type(cls): + return ArrowMaskArray + + +class ArrowMaskArray(MaskArray): + + dtype = ArrowMaskDtype() + + @classmethod + def from_scalars(cls, values): + values = np.asarray(values).astype(np.bool_, copy=False) + arr = pa.chunked_array([values]) + return cls(arr) + + def __init__(self, values, copy=False): + + # TODO: we need to rationalize the return types from + # various ops, we oftentimes return boolean array arrays + # but not chunked ones + if not isinstance(values, pa.ChunkedArray): + values = pa.chunked_array([values]) + assert values.type == pa.bool_() + if copy: + values = values.copy() + + self._data = values + + def __setitem__(self, key, value): + # TODO: hack-a-minute + data = np.array(self._data) + data[key] = value + self._data = pa.array(data) + + def astype(self, dtype, copy=True): + # needed to fix this astype for the Series constructor. 
+ if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: + if copy: + return self.copy() + return self + return super(ArrowMaskArray, self).astype(dtype, copy) + + @property + def nbytes(self): + return sum(x.size for chunk in self._data.chunks + for x in chunk.buffers() + if x is not None) + + def take(self, indices, allow_fill=False, fill_value=None, axis=None): + # TODO: had to add axis here + data = self._data.to_pandas() + + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indices, fill_value=fill_value, + allow_fill=allow_fill) + return self._from_sequence(result, dtype=self.dtype) + + def _concat_same_type(cls, to_concat): + chunks = list(itertools.chain.from_iterable(x._data.chunks + for x in to_concat)) + arr = pa.chunked_array(chunks) + return cls(arr) + + def __array__(self, dtype=None): + return np.array(self._data, copy=False) diff --git a/pandas/core/common.py b/pandas/core/common.py index e62a2119df820..ec45b521c9dd4 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -19,7 +19,8 @@ from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, is_extension_array_dtype, is_integer) -from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ( + ABCIndex, ABCIndexClass, ABCMaskArray, ABCSeries) from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -115,7 +116,7 @@ def is_bool_indexer(key: Any) -> bool: and contains missing values. 
""" na_msg = 'cannot index with vector containing NA / NaN values' - if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or + if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex, ABCMaskArray)) or (is_array_like(key) and is_extension_array_dtype(key.dtype))): if key.dtype == np.object_: key = np.asarray(values_from_object(key)) diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 134ec95729833..aca82ed621540 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -73,6 +73,9 @@ def _check(cls, inst): ABCPandasArray = create_pandas_abc_type("ABCPandasArray", "_typ", ("npy_extension",)) +ABCMaskArray = create_pandas_abc_type("ABCMaskArray", + "_typ", + ("maskarray")) class _ABCGeneric(type): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bd8a8852964e3..a7a46048acb4a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -785,8 +785,12 @@ def _try_cast(self, result, obj, numeric_only=False): elif is_extension_array_dtype(dtype): # The function can return something of any type, so check # if the type is compatible with the calling EA. 
+ + # return the same type (Series) as our caller try: - result = obj._values._from_sequence(result, dtype=dtype) + result = result._constructor( + obj._values._from_sequence(result, dtype=dtype), + index=result.index, name=result.name) except Exception: # https://github.com/pandas-dev/pandas/issues/22850 # pandas has no control over what 3rd-party ExtensionArrays @@ -1277,6 +1281,16 @@ def f(self, **kwargs): except Exception: result = self.aggregate( lambda x: npfunc(x, axis=self.axis)) + + # coerce the columns if we can + if isinstance(result, DataFrame): + for col in result.columns: + result[col] = self._try_cast( + result[col], self.obj[col]) + else: + result = self._try_cast( + result, self.obj) + if _convert: result = result._convert(datetime=True) return result diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 68d4e746f72ad..f262722d0641b 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -11,7 +11,8 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_float, is_integer, is_integer_dtype, is_iterator, is_list_like, is_numeric_dtype, is_scalar, is_sequence, is_sparse) -from pandas.core.dtypes.generic import ABCDataFrame, ABCPanel, ABCSeries +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCMaskArray, ABCPanel, ABCSeries) from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com @@ -2494,6 +2495,8 @@ def check_bool_indexer(ax, key): elif is_sparse(result): result = result.to_dense() result = np.asarray(result, dtype=bool) + elif isinstance(result, ABCMaskArray): + result = np.array(result, copy=False) else: # is_bool_indexer has already checked for nulls in the case of an # object array key, so no check needed here diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 662fe6e3ecb37..038c982749f5f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1783,6 +1783,19 @@ def _slice(self, slicer): 
return self.values[slicer] + def _try_cast_result(self, result, dtype=None): + """ + if we have an operation that operates on for example floats + we want to try to cast back to our EA here if possible + """ + try: + result = self._holder._from_sequence( + result.ravel(), dtype=dtype) + except Exception: + pass + + return result + def formatting_values(self): # Deprecating the ability to override _formatting_values. # Do the warning here, it's only user in pandas, since we diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 95516aec060b7..c2b2e9deac389 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -88,11 +88,12 @@ def _f(*args, **kwargs): class bottleneck_switch: - def __init__(self, **kwargs): + def __init__(self, name=None, **kwargs): + self.name = name self.kwargs = kwargs def __call__(self, alt): - bn_name = alt.__name__ + bn_name = self.name or alt.__name__ try: bn_func = getattr(bn, bn_name) @@ -724,7 +725,8 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None): def _nanminmax(meth, fill_value_typ): - @bottleneck_switch() + + @bottleneck_switch(name='nan' + meth) def reduction(values, axis=None, skipna=True, mask=None): values, mask, dtype, dtype_max, fill_value = _get_values( @@ -744,7 +746,6 @@ def reduction(values, axis=None, skipna=True, mask=None): result = _wrap_results(result, dtype, fill_value) return _maybe_null_out(result, axis, mask) - reduction.__name__ = 'nan' + meth return reduction diff --git a/pandas/tests/extension/arrow/__init__.py b/pandas/tests/arrays/mask/__init__.py similarity index 100% rename from pandas/tests/extension/arrow/__init__.py rename to pandas/tests/arrays/mask/__init__.py diff --git a/pandas/tests/arrays/mask/test_mask.py b/pandas/tests/arrays/mask/test_mask.py new file mode 100644 index 0000000000000..ae7ec1d8b7a3d --- /dev/null +++ b/pandas/tests/arrays/mask/test_mask.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- +import numpy as np +import pytest + +from pandas.util import 
testing as tm + + +@pytest.fixture(params=['numpy', 'arrow', 'mask']) +def mask_dtype(request): + """ dtype type """ + if request.param == 'numpy': + from pandas.core.arrays.mask._numpy import NumpyMaskDtype + return NumpyMaskDtype + elif request.param == 'arrow': + pytest.importorskip('pyarrow', minversion="0.10.0") + from pandas.core.arrays.mask._pyarrow import ArrowMaskDtype + return ArrowMaskDtype + elif request.param == 'mask': + from pandas.core.arrays.mask import get_mask_array_type + return type(get_mask_array_type().dtype) + + +@pytest.fixture +def mask_type(mask_dtype): + """ array type """ + return mask_dtype.construct_array_type() + + +@pytest.fixture +def mask(mask_type): + """ array object """ + return mask_type._from_sequence([1, 0, 1]) + + +def test_construction(mask_type): + expected = np.array([1, 0, 1], dtype=bool) + + # list + result = np.array(mask_type._from_sequence([1, 0, 1])) + tm.assert_numpy_array_equal(result, expected) + + # array + result = np.array(mask_type._from_sequence(np.array([1, 0, 1]))) + tm.assert_numpy_array_equal(result, expected) + + result = np.array(mask_type._from_sequence( + np.array([1, 0, 1], dtype=bool))) + tm.assert_numpy_array_equal(result, expected) + + +def test_str(mask): + + result = repr(mask) + expected = '<{}>\n[True, False, True]\nLength: 3, dtype: {}'.format( + mask.__class__.__name__, mask.dtype) + assert result == expected + + +def test_indexing(mask): + + # getitem + assert mask[0] + assert not mask[1] + assert mask[2] + + # slice + assert (mask[:] == mask).all() + assert (mask[[0, 1]] == mask._from_sequence([1, 0])).all() + + # setitem + mask[0] = False + assert not mask[0] + mask[[0, 1]] = [1, 1] + assert mask.all() + + +def test_ops(mask): + + mask2 = mask._from_sequence([0, 0, 0]) + assert not mask.all() + assert mask.any() + assert (mask2 | mask == mask).all() + assert (mask2 & mask == mask2).any() + + assert (~mask2).all() + + # inplace + mask2 |= mask + assert (mask2 == mask._from_sequence([1, 
0, 1])).all() + + mask2 &= np.array([0, 0, 0], dtype=bool) + assert (mask2 == mask._from_sequence([0, 0, 0])).all() + + +def test_functions(mask): + + assert mask.sum() == 2 + + mask2 = mask.copy() + assert mask2 is not mask + assert (mask2 == mask).all() + + assert mask.size == len(mask) + + +def test_dtype(mask_dtype): + m = mask_dtype() + assert m == m + assert m == mask_dtype() + assert hash(m) is not None diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 4512e98ebe0cf..c58c6d615b496 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -167,6 +167,7 @@ def _check_op(self, s, op_name, other, exc=None): def _check_op_float(self, result, expected, mask, s, op_name, other): # check comparisions that are resulting in float dtypes + mask |= (expected == np.inf) | (expected == -np.inf) expected[mask] = np.nan tm.assert_series_equal(result, expected) @@ -341,7 +342,8 @@ def _compare_other(self, data, op_name, other): # fill the nan locations expected[data._mask] = op_name == '__ne__' - tm.assert_series_equal(result, expected) + # TODO: remove check_dtype + tm.assert_series_equal(result, expected, check_dtype=False) # series s = pd.Series(data) @@ -353,7 +355,8 @@ def _compare_other(self, data, op_name, other): # fill the nan locations expected[data._mask] = op_name == '__ne__' - tm.assert_series_equal(result, expected) + # TODO: remove check_dtype + tm.assert_series_equal(result, expected, check_dtype=False) def test_compare_scalar(self, data, all_compare_operators): op_name = all_compare_operators @@ -553,13 +556,15 @@ def test_integer_array_constructor_copy(): values = np.array([1, 2, 3, 4], dtype='int64') mask = np.array([False, False, False, True], dtype='bool') + # TODO: need to construct an equiv mask here + # for a pa.bool_ dtype result = IntegerArray(values, mask) assert result._data is values - assert result._mask is mask + assert (result._mask == mask).all() result = 
IntegerArray(values, mask, copy=True) assert result._data is not values - assert result._mask is not mask + assert (result._mask == mask).all() @pytest.mark.parametrize( @@ -691,7 +696,7 @@ def test_reduce_to_float(op): expected = pd.DataFrame({ "B": np.array([1.0, 3.0]), - "C": integer_array([1, 3], dtype="Int64") + "C": np.array([1.0, 3.0]), }, index=pd.Index(['a', 'b'], name='A')) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 3634b59047f76..a9bd15a6722df 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -14,6 +14,7 @@ from pandas.conftest import ( ALL_EA_INT_DTYPES, ALL_INT_DTYPES, SIGNED_EA_INT_DTYPES, SIGNED_INT_DTYPES, UNSIGNED_EA_INT_DTYPES, UNSIGNED_INT_DTYPES) +from pandas.core.arrays.mask import get_mask_array_type from pandas.core.sparse.api import SparseDtype import pandas.util.testing as tm @@ -526,6 +527,8 @@ def test_is_bool_dtype(): assert com.is_bool_dtype(np.bool) assert com.is_bool_dtype(np.array([True, False])) assert com.is_bool_dtype(pd.Index([True, False])) + assert com.is_bool_dtype( + get_mask_array_type()._from_sequence([True, False])) @pytest.mark.parametrize("check_scipy", [ diff --git a/pandas/tests/extension/arrow/bool.py b/pandas/tests/extension/arrow/bool.py deleted file mode 100644 index 025c4cacd8fa1..0000000000000 --- a/pandas/tests/extension/arrow/bool.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Rudimentary Apache Arrow-backed ExtensionArray. - -At the moment, just a boolean array / type is implemented. -Eventually, we'll want to parametrize the type and support -multiple dtypes. Not all methods are implemented yet, and the -current implementation is not efficient. 
-""" -import copy -import itertools - -import numpy as np -import pyarrow as pa - -import pandas as pd -from pandas.api.extensions import ( - ExtensionArray, ExtensionDtype, register_extension_dtype, take) - - -@register_extension_dtype -class ArrowBoolDtype(ExtensionDtype): - - type = np.bool_ - kind = 'b' - name = 'arrow_bool' - na_value = pa.NULL - - @classmethod - def construct_from_string(cls, string): - if string == cls.name: - return cls() - else: - raise TypeError("Cannot construct a '{}' from " - "'{}'".format(cls, string)) - - @classmethod - def construct_array_type(cls): - return ArrowBoolArray - - def _is_boolean(self): - return True - - -class ArrowBoolArray(ExtensionArray): - def __init__(self, values): - if not isinstance(values, pa.ChunkedArray): - raise ValueError - - assert values.type == pa.bool_() - self._data = values - self._dtype = ArrowBoolDtype() - - def __repr__(self): - return "ArrowBoolArray({})".format(repr(self._data)) - - @classmethod - def from_scalars(cls, values): - arr = pa.chunked_array([pa.array(np.asarray(values))]) - return cls(arr) - - @classmethod - def from_array(cls, arr): - assert isinstance(arr, pa.Array) - return cls(pa.chunked_array([arr])) - - @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): - return cls.from_scalars(scalars) - - def __getitem__(self, item): - if pd.api.types.is_scalar(item): - return self._data.to_pandas()[item] - else: - vals = self._data.to_pandas()[item] - return type(self).from_scalars(vals) - - def __len__(self): - return len(self._data) - - def astype(self, dtype, copy=True): - # needed to fix this astype for the Series constructor. 
- if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: - if copy: - return self.copy() - return self - return super(ArrowBoolArray, self).astype(dtype, copy) - - @property - def dtype(self): - return self._dtype - - @property - def nbytes(self): - return sum(x.size for chunk in self._data.chunks - for x in chunk.buffers() - if x is not None) - - def isna(self): - nas = pd.isna(self._data.to_pandas()) - return type(self).from_scalars(nas) - - def take(self, indices, allow_fill=False, fill_value=None): - data = self._data.to_pandas() - - if allow_fill and fill_value is None: - fill_value = self.dtype.na_value - - result = take(data, indices, fill_value=fill_value, - allow_fill=allow_fill) - return self._from_sequence(result, dtype=self.dtype) - - def copy(self, deep=False): - if deep: - return type(self)(copy.deepcopy(self._data)) - else: - return type(self)(copy.copy(self._data)) - - def _concat_same_type(cls, to_concat): - chunks = list(itertools.chain.from_iterable(x._data.chunks - for x in to_concat)) - arr = pa.chunked_array(chunks) - return cls(arr) - - def __invert__(self): - return type(self).from_scalars( - ~self._data.to_pandas() - ) - - def _reduce(self, method, skipna=True, **kwargs): - if skipna: - arr = self[~self.isna()] - else: - arr = self - - try: - op = getattr(arr, method) - except AttributeError: - raise TypeError - return op(**kwargs) - - def any(self, axis=0, out=None): - return self._data.to_pandas().any() - - def all(self, axis=0, out=None): - return self._data.to_pandas().all() diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py deleted file mode 100644 index 2aece66d94150..0000000000000 --- a/pandas/tests/extension/arrow/test_bool.py +++ /dev/null @@ -1,68 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -from pandas.tests.extension import base -import pandas.util.testing as tm - -pytest.importorskip('pyarrow', minversion="0.10.0") - -from .bool import 
ArrowBoolArray, ArrowBoolDtype # isort:skip - - -@pytest.fixture -def dtype(): - return ArrowBoolDtype() - - -@pytest.fixture -def data(): - return ArrowBoolArray.from_scalars(np.random.randint(0, 2, size=100, - dtype=bool)) - - -@pytest.fixture -def data_missing(): - return ArrowBoolArray.from_scalars([None, True]) - - -class BaseArrowTests: - pass - - -class TestDtype(BaseArrowTests, base.BaseDtypeTests): - def test_array_type_with_arg(self, data, dtype): - pytest.skip("GH-22666") - - -class TestInterface(BaseArrowTests, base.BaseInterfaceTests): - def test_repr(self, data): - raise pytest.skip("TODO") - - -class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): - def test_from_dtype(self, data): - pytest.skip("GH-22666") - - # seems like some bug in isna on empty BoolArray returning floats. - @pytest.mark.xfail(reason='bad is-na for empty data') - def test_from_sequence_from_cls(self, data): - super(TestConstructors, self).test_from_sequence_from_cls(data) - - -class TestReduce(base.BaseNoReduceTests): - def test_reduce_series_boolean(self): - pass - - -class TestReduceBoolean(base.BaseBooleanReduceTests): - pass - - -def test_is_bool_dtype(data): - assert pd.api.types.is_bool_dtype(data) - assert pd.core.common.is_bool_indexer(data) - s = pd.Series(range(len(data))) - result = s[data] - expected = s[np.asarray(data)] - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 6388902e45627..786e9df8eee55 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -59,8 +59,10 @@ def test_isna_extension_array(self, data_missing): # _reduce. 
At the *very* least, you must implement any and all na = data_missing.isna() if is_extension_array_dtype(na): - assert na._reduce('any') - assert na.any() + + # TODO: .isna() can actuall be all False + assert na._reduce('any') in [True, False] + assert na.any() in [True, False] assert not na._reduce('all') assert not na.all() diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index c4b70f2013265..a9fe21357a135 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -18,6 +18,12 @@ def check_reduce(self, s, op_name, skipna): expected = getattr(s.astype('float64'), op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) + def check_reduce_bool(self, s, op_name, skipna): + """check_reduce with casting back to bool""" + result = getattr(s, op_name)(skipna=skipna) + expected = bool(getattr(s.astype('float64'), op_name)(skipna=skipna)) + tm.assert_almost_equal(result, expected) + class BaseNoReduceTests(BaseReduceTests): """ we don't define any reductions """ diff --git a/pandas/tests/extension/mask/__init__.py b/pandas/tests/extension/mask/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/mask/test_numpy_bool.py b/pandas/tests/extension/mask/test_numpy_bool.py new file mode 100644 index 0000000000000..330bd7c37179d --- /dev/null +++ b/pandas/tests/extension/mask/test_numpy_bool.py @@ -0,0 +1,85 @@ +""" +This file contains a minimal set of tests for compliance with the extension +array interface test suite, and should contain no other tests. +The test suite for the full functionality of the array is located in +`pandas/tests/arrays/`. + +The tests in this file are inherited from the BaseExtensionTests, and only +minimal tweaks should be applied to get the tests passing (by overwriting a +parent method). 
+ +Additional tests should either be added to one of the BaseExtensionTests +classes (if they are relevant for the extension interface for all dtypes), or +be added to the array-specific tests in `pandas/tests/arrays/`. + +""" +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.mask._numpy import NumpyMaskArray, NumpyMaskDtype +from pandas.tests.extension import base +import pandas.util.testing as tm + + +@pytest.fixture +def dtype(): + return NumpyMaskDtype() + + +@pytest.fixture +def data(): + return NumpyMaskArray.from_scalars( + np.random.randint(0, 2, size=100, dtype=bool)) + + +@pytest.fixture +def data_missing(): + pytest.skip("not supported in NumpyMaskArray") + + +class BaseNumpyTests(object): + pass + + +class TestDtype(BaseNumpyTests, base.BaseDtypeTests): + pass + + +class TestInterface(BaseNumpyTests, base.BaseInterfaceTests): + pass + + +class TestConstructors(BaseNumpyTests, base.BaseConstructorsTests): + def test_from_dtype(self, data): + pytest.skip("GH-22666") + + +class TestReduceBoolean(base.BaseBooleanReduceTests): + + @pytest.mark.parametrize('skipna', [True, False]) + def test_reduce_series( + self, data, only_numeric_reductions, skipna): + op_name = only_numeric_reductions + s = pd.Series(data) + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) + + @pytest.mark.parametrize('skipna', [True, False]) + def test_reduce_series_non_numeric( + self, data, only_non_numeric_reductions, skipna): + op_name = only_non_numeric_reductions + s = pd.Series(data) + if op_name == 'sum': + self.check_reduce(s, op_name, skipna) + else: + self.check_reduce_bool(s, op_name, skipna) + + +def test_is_bool_dtype(data): + assert pd.api.types.is_bool_dtype(data) + assert pd.core.common.is_bool_indexer(data) + s = pd.Series(range(len(data))) + result = s[data] + expected = s[np.asarray(data)] + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/mask/test_pyarrow_bool.py 
b/pandas/tests/extension/mask/test_pyarrow_bool.py new file mode 100644 index 0000000000000..817349f0773f4 --- /dev/null +++ b/pandas/tests/extension/mask/test_pyarrow_bool.py @@ -0,0 +1,89 @@ +""" +This file contains a minimal set of tests for compliance with the extension +array interface test suite, and should contain no other tests. +The test suite for the full functionality of the array is located in +`pandas/tests/arrays/`. + +The tests in this file are inherited from the BaseExtensionTests, and only +minimal tweaks should be applied to get the tests passing (by overwriting a +parent method). + +Additional tests should either be added to one of the BaseExtensionTests +classes (if they are relevant for the extension interface for all dtypes), or +be added to the array-specific tests in `pandas/tests/arrays/`. + +""" +import numpy as np +import pytest + +import pandas as pd +from pandas.tests.extension import base +import pandas.util.testing as tm + +pytest.importorskip('pyarrow', minversion="0.10.0") + +from pandas.core.arrays.mask._pyarrow import ( # isort:skip + ArrowMaskArray, ArrowMaskDtype) + + +@pytest.fixture +def dtype(): + return ArrowMaskDtype() + + +@pytest.fixture +def data(): + return ArrowMaskArray.from_scalars( + np.random.randint(0, 2, size=100, dtype=bool)) + + +@pytest.fixture +def data_missing(): + return ArrowMaskArray.from_scalars([None, True]) + + +class BaseArrowTests: + pass + + +class TestDtype(BaseArrowTests, base.BaseDtypeTests): + pass + + +class TestInterface(BaseArrowTests, base.BaseInterfaceTests): + pass + + +class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): + def test_from_dtype(self, data): + pytest.skip("GH-22666") + + +class TestReduceBoolean(base.BaseBooleanReduceTests): + + @pytest.mark.parametrize('skipna', [True, False]) + def test_reduce_series( + self, data, only_numeric_reductions, skipna): + op_name = only_numeric_reductions + s = pd.Series(data) + with pytest.raises(TypeError): + getattr(s, 
op_name)(skipna=skipna) + + @pytest.mark.parametrize('skipna', [True, False]) + def test_reduce_series_non_numeric( + self, data, only_non_numeric_reductions, skipna): + op_name = only_non_numeric_reductions + s = pd.Series(data) + if op_name == 'sum': + self.check_reduce(s, op_name, skipna) + else: + self.check_reduce_bool(s, op_name, skipna) + + +def test_is_bool_dtype(data): + assert pd.api.types.is_bool_dtype(data) + assert pd.core.common.is_bool_indexer(data) + s = pd.Series(range(len(data))) + result = s[data] + expected = s[np.asarray(data)] + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index e9f96390821a6..1c35bc0dcbd45 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -24,6 +24,7 @@ Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype) from pandas.tests.extension import base +from pandas.util import testing as tm def make_data(): @@ -181,7 +182,22 @@ class TestSetitem(base.BaseSetitemTests): class TestMissing(base.BaseMissingTests): - pass + + def test_isna(self, data_missing): + # TODO: should actually compare that this is an ArrowBoolArray + expected = np.array([True, False]) + + result = np.array(pd.isna(data_missing)) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(data_missing).isna() + expected = pd.Series(expected) + self.assert_series_equal(result, expected, check_dtype=False) + + # TODO: need a pd.bool_ dtype here + result = pd.Series(data_missing).drop([0, 1]).isna() + expected = pd.Series([], dtype=bool) + self.assert_series_equal(result, expected, check_dtype=False) class TestMethods(base.BaseMethodsTests): diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 7a3d189d3020e..5bc093ae858de 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -284,18 +284,21 @@ def 
test_first_last_tz(data, expected_first, expected_last): ]) def test_first_last_tz_multi_column(method, ts, alpha): # GH 21603 + category_string = pd.Series(list('abc')).astype( + 'category') df = pd.DataFrame({'group': [1, 1, 2], - 'category_string': pd.Series(list('abc')).astype( - 'category'), + 'category_string': category_string, 'datetimetz': pd.date_range('20130101', periods=3, tz='US/Eastern')}) result = getattr(df.groupby('group'), method)() - expepcted = pd.DataFrame({'category_string': [alpha, 'c'], - 'datetimetz': [ts, - Timestamp('2013-01-03', - tz='US/Eastern')]}, - index=pd.Index([1, 2], name='group')) - assert_frame_equal(result, expepcted) + expected = pd.DataFrame( + {'category_string': pd.Categorical( + [alpha, 'c'], dtype=category_string.dtype), + 'datetimetz': [ts, + Timestamp('2013-01-03', + tz='US/Eastern')]}, + index=pd.Index([1, 2], name='group')) + assert_frame_equal(result, expected) def test_nth_multi_index_as_expected(): diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index b5cc28e07fca6..c0d4f2ba8a424 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -112,6 +112,11 @@ def test_resample_integerarray(): dtype="Int64") assert_series_equal(result, expected) + result = ts.resample('3T').mean() + expected = Series([1.0, 4, 7], + index=pd.date_range('1/1/2000', periods=3, freq='3T')) + assert_series_equal(result, expected) + def test_resample_basic_grouper(series): s = series