From a810bd2e4d3571289a672bf81671b8a566e97eaf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Dec 2018 12:04:57 -0600 Subject: [PATCH 01/11] Squashed keepdims --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/compat/numpy/function.py | 23 ++++++++++++++--- pandas/core/generic.py | 12 +++++++-- pandas/tests/series/test_analytics.py | 36 +++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 78f864f0dcb73..7da1c1aeef348 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1400,6 +1400,7 @@ Numeric - Added ``log10`` to the list of supported functions in :meth:`DataFrame.eval` (:issue:`24139`) - Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) - Checking PEP 3141 numbers in :func:`~pandas.api.types.is_scalar` function returns ``True`` (:issue:`22903`) +- Reduction methods like :meth:`Series.sum` now accept the default value of ``keepdims=False`` when called from a NumPy ufunc, rather than raising a ``TypeError``. Full support for ``keepdims`` has not been implemented (:issue:`24356`). 
Conversion ^^^^^^^^^^ diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 30fdeca35faf3..417ddd0d8af17 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -189,15 +189,16 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): ALLANY_DEFAULTS = OrderedDict() ALLANY_DEFAULTS['dtype'] = None ALLANY_DEFAULTS['out'] = None +ALLANY_DEFAULTS['keepdims'] = False validate_all = CompatValidator(ALLANY_DEFAULTS, fname='all', method='both', max_fname_arg_count=1) validate_any = CompatValidator(ALLANY_DEFAULTS, fname='any', method='both', max_fname_arg_count=1) -LOGICAL_FUNC_DEFAULTS = dict(out=None) +LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False) validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method='kwargs') -MINMAX_DEFAULTS = dict(out=None) +MINMAX_DEFAULTS = dict(out=None, keepdims=False) validate_min = CompatValidator(MINMAX_DEFAULTS, fname='min', method='both', max_fname_arg_count=1) validate_max = CompatValidator(MINMAX_DEFAULTS, fname='max', @@ -225,16 +226,32 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): STAT_FUNC_DEFAULTS = OrderedDict() STAT_FUNC_DEFAULTS['dtype'] = None STAT_FUNC_DEFAULTS['out'] = None + +PROD_DEFAULTS = SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy() +SUM_DEFAULTS['keepdims'] = False +SUM_DEFAULTS['initial'] = None + +MEDIAN_DEFAULTS = STAT_FUNC_DEFAULTS.copy() +MEDIAN_DEFAULTS['overwrite_input'] = False +MEDIAN_DEFAULTS['keepdims'] = False + +STAT_FUNC_DEFAULTS['keepdims'] = False + validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS, method='kwargs') -validate_sum = CompatValidator(STAT_FUNC_DEFAULTS, fname='sort', +validate_sum = CompatValidator(SUM_DEFAULTS, fname='sum', method='both', max_fname_arg_count=1) +validate_prod = CompatValidator(PROD_DEFAULTS, fname="prod", + method="both", max_fname_arg_count=1) validate_mean = CompatValidator(STAT_FUNC_DEFAULTS, fname='mean', method='both', max_fname_arg_count=1) +validate_median 
= CompatValidator(MEDIAN_DEFAULTS, fname='median', + method='both', max_fname_arg_count=1) STAT_DDOF_FUNC_DEFAULTS = OrderedDict() STAT_DDOF_FUNC_DEFAULTS['dtype'] = None STAT_DDOF_FUNC_DEFAULTS['out'] = None +STAT_DDOF_FUNC_DEFAULTS['keepdims'] = False validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method='kwargs') diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6eb6bc124c80a..c1a53e1e97803 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10834,7 +10834,12 @@ def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc, def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs): - nv.validate_stat_func(tuple(), kwargs, fname=name) + if name == 'sum': + nv.validate_sum(tuple(), kwargs) + elif name == 'prod': + nv.validate_prod(tuple(), kwargs) + else: + nv.validate_stat_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True if axis is None: @@ -10855,7 +10860,10 @@ def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f, @Appender(_num_doc) def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - nv.validate_stat_func(tuple(), kwargs, fname=name) + if name == 'median': + nv.validate_median(tuple(), kwargs) + else: + nv.validate_stat_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True if axis is None: diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 81d60aba44b0f..0d8804dba83c1 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1641,6 +1641,42 @@ def test_value_counts_categorical_not_ordered(self): tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) + @pytest.mark.parametrize("func", [np.any, np.all]) + @pytest.mark.parametrize("kwargs", [ + dict(keepdims=True), + dict(out=object()), + ]) + @td.skip_if_np_lt_115 + def 
test_validate_any_all_out_keepdims_raises(self, kwargs, func): + s = pd.Series([1, 2]) + param = list(kwargs)[0] + name = func.__name__ + + msg = "the '{}' parameter .* {}".format(param, name) + with pytest.raises(ValueError, match=msg): + func(s, **kwargs) + + @td.skip_if_np_lt_115 + def test_validate_sum_initial(self): + s = pd.Series([1, 2]) + with pytest.raises(ValueError, match="the 'initial' .* sum"): + np.sum(s, initial=10) + + def test_validate_median_initial(self): + s = pd.Series([1, 2]) + with pytest.raises(ValueError, + match="the 'overwrite_input' .* median"): + # It seems like np.median doesn't dispatch, so we use the + # method instead of the ufunc. + s.median(overwrite_input=True) + + @td.skip_if_np_lt_115 + def test_validate_stat_keepdims(self): + s = pd.Series([1, 2]) + with pytest.raises(ValueError, + match="the 'keepdims'"): + np.sum(s, keepdims=True) + main_dtypes = [ 'datetime', From e02f3dd5374728e829d702ef991859414e7a197b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Dec 2018 12:05:22 -0600 Subject: [PATCH 02/11] API: .array is always an ExtensionArray --- doc/source/api.rst | 1 + doc/source/basics.rst | 41 ++- doc/source/dsintro.rst | 8 +- doc/source/whatsnew/v0.24.0.rst | 9 +- pandas/__init__.py | 1 + pandas/arrays/__init__.py | 11 + pandas/core/arrays/__init__.py | 1 + pandas/core/arrays/categorical.py | 13 +- pandas/core/arrays/numpy_.py | 446 +++++++++++++++++++++++ pandas/core/base.py | 48 ++- pandas/core/dtypes/common.py | 55 ++- pandas/core/dtypes/generic.py | 4 + pandas/core/indexes/base.py | 7 +- pandas/core/internals/blocks.py | 28 +- pandas/core/internals/construction.py | 15 +- pandas/core/reshape/reshape.py | 7 +- pandas/tests/api/test_api.py | 2 +- pandas/tests/arrays/test_numpy.py | 93 +++++ pandas/tests/extension/test_numpy.py | 212 +++++++++++ pandas/tests/frame/test_constructors.py | 10 + pandas/tests/indexes/test_base.py | 6 + pandas/tests/series/test_constructors.py | 7 + pandas/tests/test_base.py | 19 +- 
23 files changed, 980 insertions(+), 64 deletions(-) create mode 100644 pandas/arrays/__init__.py create mode 100644 pandas/core/arrays/numpy_.py create mode 100644 pandas/tests/arrays/test_numpy.py create mode 100644 pandas/tests/extension/test_numpy.py diff --git a/doc/source/api.rst b/doc/source/api.rst index d80c73d4a7c1c..47abeb17de1e3 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -2681,6 +2681,7 @@ objects. api.extensions.register_index_accessor api.extensions.ExtensionDtype api.extensions.ExtensionArray + arrays.PandasArray .. This is to prevent warnings in the doc build. We don't want to encourage .. these methods. diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 0f6b5cd0b5e43..d18c00b3b07f6 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -71,8 +71,10 @@ the **array** property s.array s.index.array -Depending on the data type (see :ref:`basics.dtypes`), :attr:`~Series.array` -be either a NumPy array or an :ref:`ExtensionArray `. +:attr:`~Series.array` will always be an :class:`~pandas.api.extensions.ExtensionArray`. +The exact details of what an ``ExtensionArray`` is and why pandas uses them is a bit +beyond the scope of this introduction. See :ref:`basics.dtypes` for more. + If you know you need a NumPy array, use :meth:`~Series.to_numpy` or :meth:`numpy.asarray`. @@ -81,10 +83,30 @@ or :meth:`numpy.asarray`. s.to_numpy() np.asarray(s) -For Series and Indexes backed by NumPy arrays (like we have here), this will -be the same as :attr:`~Series.array`. When the Series or Index is backed by -a :class:`~pandas.api.extension.ExtensionArray`, :meth:`~Series.to_numpy` -may involve copying data and coercing values. +When the Series or Index is backed by +an :class:`~pandas.api.extension.ExtensionArray`, :meth:`~Series.to_numpy` +may involve copying data and coercing values. See :ref:`basics.dtypes` for more. + +:meth:`~Series.to_numpy` gives some control over the ``dtype`` of the +resulting :class:`ndarray`. 
For example, consider datetimes with timezones. +NumPy doesn't have a dtype to represent timezone-aware datetimes, so there +are two possibly useful representations: + +1. An object-dtype :class:`ndarray` with :class:`Timestamp` objects, each + with the correct ``tz`` +2. A ``datetime64[ns]`` -dtype :class:`ndarray`, where the values have + been converted to UTC and the timezone discarded + +Timezones may be preserved with ``dtype=object`` + +.. ipython:: python + + ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + ser.to_numpy(dtype=object) + +Or thrown away with ``dtype='datetime64[ns]'`` + + ser.to_numpy(dtype="datetime64[ns]") :meth:`~Series.to_numpy` gives some control over the ``dtype`` of the resulting :class:`ndarray`. For example, consider datetimes with timezones. @@ -109,7 +131,7 @@ Or thrown away with ``dtype='datetime64[ns]'`` Getting the "raw data" inside a :class:`DataFrame` is possibly a bit more complex. When your ``DataFrame`` only has a single data type for all the -columns, :attr:`DataFrame.to_numpy` will return the underlying data: +columns, :meth:`DataFrame.to_numpy` will return the underlying data: .. ipython:: python @@ -136,8 +158,9 @@ drawbacks: 1. When your Series contains an :ref:`extension type `, it's unclear whether :attr:`Series.values` returns a NumPy array or the extension array. - :attr:`Series.array` will always return the actual array backing the Series, - while :meth:`Series.to_numpy` will always return a NumPy array. + :attr:`Series.array` will always return an ``ExtensionArray``, and will never + copy data. :meth:`Series.to_numpy` will always return a NumPy array, + potentially at the cost of copying / coercing values. 2. When your DataFrame contains a mixture of data types, :attr:`DataFrame.values` may involve copying data and coercing values to a common dtype, a relatively expensive operation. 
:meth:`DataFrame.to_numpy`, being a method, makes it clearer that the diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index d4a83b6807fd5..424ea7370849c 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -146,11 +146,15 @@ If you need the actual array backing a ``Series``, use :attr:`Series.array`. s.array -Again, this is often a NumPy array, but may instead be a -:class:`~pandas.api.extensions.ExtensionArray`. See :ref:`basics.dtypes` for more. Accessing the array can be useful when you need to do some operation without the index (to disable :ref:`automatic alignment `, for example). +:attr:`Series.array` will always be an :class:`~pandas.api.extensions.ExtensionArray`. +Briefly, an ExtensionArray is a thin wrapper around one or more *concrete* arrays like a +:class:`numpy.ndarray`. Pandas knows how to take an ``ExtensionArray`` and +store it in a ``Series`` or a column of a ``DataFrame``. +See :ref:`basics.dtypes` for more. + While Series is ndarray-like, if you need an *actual* ndarray, then use :meth:`Series.to_numpy`. diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 7da1c1aeef348..b1c96a3c23569 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -64,8 +64,11 @@ If you need an actual NumPy array, use :meth:`Series.to_numpy` or :meth:`Index.t idx.to_numpy() pd.Series(idx).to_numpy() -For Series and Indexes backed by normal NumPy arrays, this will be the same thing (and the same -as ``.values``). +For Series and Indexes backed by normal NumPy arrays, :attr:`Series.array` will return a +new :class:`arrays.PandasArray`, which is a thin (no-copy) wrapper around a +:class:`numpy.ndarray`. :class:`arrays.PandasArray` isn't especially useful on its own, +but it does provide the same interface as any extension array defined in pandas or by +a third-party library. .. ipython:: python @@ -74,7 +77,7 @@ as ``.values``). 
ser.to_numpy() We haven't removed or deprecated :attr:`Series.values` or :attr:`DataFrame.values`, but we -recommend and using ``.array`` or ``.to_numpy()`` instead. +highly recommend using ``.array`` or ``.to_numpy()`` instead. See :ref:`Dtypes ` and :ref:`Attributes and Underlying Data ` for more. diff --git a/pandas/__init__.py b/pandas/__init__.py index e86ed86fda74f..427157acb433f 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -49,6 +49,7 @@ from pandas.io.api import * from pandas.util._tester import test import pandas.testing +import pandas.arrays # use the closest tagged version if possible from ._version import get_versions diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py new file mode 100644 index 0000000000000..f6a528bc87b54 --- /dev/null +++ b/pandas/arrays/__init__.py @@ -0,0 +1,11 @@ +""" +All of pandas' ExtensionArrays. + +See :ref:`extending.extension-types` for more. +""" +from pandas.core.arrays import PandasArray + + +__all__ = [ + 'PandasArray' +] diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index ea8837332633a..c317786e7d633 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -9,3 +9,4 @@ from .integer import ( # noqa IntegerArray, integer_array) from .sparse import SparseArray # noqa +from .numpy_ import PandasArray, PandasDtype # noqa diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9a8b345cea1b3..75ce3af5c4218 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -16,11 +16,11 @@ from pandas.core.dtypes.cast import ( coerce_indexer_dtype, maybe_infer_to_datetimelike) from pandas.core.dtypes.common import ( - ensure_int64, ensure_object, ensure_platform_int, is_categorical, - is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like, - is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype, - is_iterator, is_list_like, is_object_dtype, 
is_scalar, is_sequence, - is_timedelta64_dtype) + ensure_int64, ensure_object, ensure_platform_int, extract_array, + is_categorical, is_categorical_dtype, is_datetime64_dtype, is_datetimelike, + is_dict_like, is_dtype_equal, is_extension_array_dtype, is_float_dtype, + is_integer_dtype, is_iterator, is_list_like, is_object_dtype, is_scalar, + is_sequence, is_timedelta64_dtype) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ( ABCCategoricalIndex, ABCIndexClass, ABCSeries) @@ -2086,8 +2086,7 @@ def __setitem__(self, key, value): `Categorical` does not have the same categories """ - if isinstance(value, (ABCIndexClass, ABCSeries)): - value = value.array + value = extract_array(value, extract_numpy=True) # require identical categories set if isinstance(value, Categorical): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py new file mode 100644 index 0000000000000..0eef5263ffee7 --- /dev/null +++ b/pandas/core/arrays/numpy_.py @@ -0,0 +1,446 @@ +import numbers + +import numpy as np + +from pandas._libs import lib +from pandas.compat.numpy import function as nv + +from pandas.core.dtypes.common import extract_array +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import is_list_like + +from pandas import compat +from pandas.core import nanops + +from .base import ExtensionArray, ExtensionOpsMixin + + +class PandasDtype(ExtensionDtype): + """ + A Pandas ExtensionDtype for NumPy dtypes. + + .. versionadded:: 0.24.0 + + This is mostly for internal compatibility, and is not especially + useful on its own. 
+ + Parameters + ---------- + dtype : numpy.dtype + """ + _metadata = ('_dtype',) + + def __init__(self, dtype): + dtype = np.dtype(dtype) + self._dtype = dtype + self._name = dtype.name + self._type = dtype.type + + @property + def name(self): + return self._name + + @property + def type(self): + return self._type + + @property + def _is_numeric(self): + # exclude object, str, unicode, void. + return self.kind in set('biufc') + + @property + def _is_boolean(self): + return self.kind == 'b' + + @classmethod + def construct_from_string(cls, string): + return cls(np.dtype(string)) + + def construct_array_type(cls): + return PandasArray + + @property + def kind(self): + return self._dtype.kind + + @property + def itemsize(self): + return self._dtype.itemsize + + +# TODO(NumPy1.13): remove this +# Compat for NumPy 1.12, which doesn't provide NDArrayOperatorsMixin +# or __array_ufunc__, so those operations won't be available to people +# on older NumPys. +# +# We would normally write this as bases=(...), then "class Foo(*bases): +# but Python2 doesn't allow unpacking tuples in the class statement. +# So, we fall back to "object", to avoid writing a metaclass. +try: + from numpy.lib.mixins import NDArrayOperatorsMixin +except ImportError: + NDArrayOperatorsMixin = object + + +class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin): + """ + A pandas ExtensionArray for NumPy data. + + .. versionadded :: 0.24.0 + + This is mostly for internal compatibility, and is not especially + useful on its own. + + Parameters + ---------- + values : ndarray + The NumPy ndarray to wrap. Must be 1-dimensional. + + Notes + ----- + Operations like ``+`` and applying ufuncs requires NumPy>=1.13. + """ + # If you're wondering why pd.Series(cls) doesn't put the array in an + # ExtensionBlock, search for `ABCPandasArray`. 
We check for + # that _typ to ensure that that users don't unnecessarily use EAs inside + # pandas internals, which turns off things like block consolidation. + _typ = "npy_extension" + __array_priority__ = 1000 + + # ------------------------------------------------------------------------ + # Constructors + + def __init__(self, values): + if isinstance(values, type(self)): + values = values._ndarray + values = np.asarray(values) + if values.ndim != 1: + raise ValueError("PandasArray must be 1-dimensional.") + + self._ndarray = values + self._dtype = PandasDtype(values.dtype) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + if isinstance(dtype, PandasDtype): + dtype = dtype._dtype + + result = np.asarray(scalars, dtype=dtype) + if copy and result is scalars: + result = result.copy() + return cls(result) + + @classmethod + def _from_factorized(cls, values, original): + return cls(values) + + @classmethod + def _concat_same_type(cls, to_concat): + return cls(np.concatenate(to_concat)) + + # ------------------------------------------------------------------------ + # Data + + @property + def dtype(self): + return self._dtype + + # ------------------------------------------------------------------------ + # NumPy Array Interface + + def __array__(self, dtype=None): + return np.asarray(self._ndarray, dtype=dtype) + + _HANDLED_TYPES = (np.ndarray, numbers.Number) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # Lightly modified version of + # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/\ + # numpy.lib.mixins.NDArrayOperatorsMixin.html + # The primary modification is not boxing scalar return values + # in PandasArray, since pandas' ExtensionArrays are 1-d. + out = kwargs.get('out', ()) + for x in inputs + out: + # Only support operations with instances of _HANDLED_TYPES. 
+ # Use PandasArray instead of type(self) for isinstance to + # allow subclasses that don't override __array_ufunc__ to + # handle PandasArray objects. + if not isinstance(x, self._HANDLED_TYPES + (PandasArray,)): + return NotImplemented + + # Defer to the implementation of the ufunc on unwrapped values. + inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x + for x in inputs) + if out: + kwargs['out'] = tuple( + x._ndarray if isinstance(x, PandasArray) else x + for x in out) + result = getattr(ufunc, method)(*inputs, **kwargs) + + if type(result) is tuple and len(result): + # multiple return values + if not lib.is_scalar(result[0]): + # re-box array-like results + return tuple(type(self)(x) for x in result) + else: + # but not scalar reductions + return result + elif method == 'at': + # no return value + return None + else: + # one return value + if not lib.is_scalar(result): + # re-box array-like results, but not scalar reductions + result = type(self)(result) + return result + + # ------------------------------------------------------------------------ + # Pandas ExtensionArray Interface + + def __getitem__(self, item): + if isinstance(item, type(self)): + item = item._ndarray + + result = self._ndarray[item] + if not lib.is_scalar(result): + result = type(self)(result) + return result + + def __setitem__(self, key, value): + value = extract_array(value, extract_numpy=True) + + if not lib.is_scalar(key) and is_list_like(key): + key = np.asarray(key) + if not len(key): + # early return to avoid casting unnecessarily. 
+ return + + if not lib.is_scalar(value): + value = np.asarray(value) + + values = self._ndarray + t = np.result_type(value, values) + if t != self._ndarray.dtype: + values = values.astype(t, casting='safe') + values[key] = value + self._dtype = PandasDtype(t) + self._ndarray = values + else: + self._ndarray[key] = value + + def __len__(self): + return len(self._ndarray) + + @property + def nbytes(self): + return self._ndarray.nbytes + + def isna(self): + from pandas import isna + + return isna(self._ndarray) + + def fillna(self, value=None, method=None, limit=None): + from pandas.api.types import is_array_like + from pandas.util._validators import validate_fillna_kwargs + from pandas.core.missing import pad_1d, backfill_1d + + # TODO(_values_for_fillna): remove this + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if is_array_like(value): + if len(value) != len(self): + raise ValueError("Length of 'value' does not match. Got ({}) " + " expected {}".format(len(value), len(self))) + value = value[mask] + + if mask.any(): + if method is not None: + func = pad_1d if method == 'pad' else backfill_1d + new_values = func(self._ndarray, limit=limit, + mask=mask) + new_values = self._from_sequence(new_values, dtype=self.dtype) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + + def take(self, indices, allow_fill=False, fill_value=None): + from pandas.core.algorithms import take + + result = take(self._ndarray, indices, allow_fill=allow_fill, + fill_value=fill_value) + return type(self)(result) + + def copy(self, deep=False): + return type(self)(self._ndarray.copy()) + + def _values_for_argsort(self): + return self._ndarray + + def _values_for_factorize(self): + return self._ndarray, -1 + + def unique(self): + from pandas import unique + + return type(self)(unique(self._ndarray)) + + # 
------------------------------------------------------------------------ + # Reductions + + def _reduce(self, name, skipna=True, **kwargs): + meth = getattr(self, name, None) + if meth: + return meth(skipna=skipna, **kwargs) + else: + msg = ( + "'{}' does not implement reduction '{}'" + ) + raise TypeError(msg.format(type(self).__name__, name)) + + def any(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_any((), dict(out=out, keepdims=keepdims)) + return nanops.nanany(self._ndarray, axis=axis, skipna=skipna) + + def all(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_all((), dict(out=out, keepdims=keepdims)) + return nanops.nanall(self._ndarray, axis=axis, skipna=skipna) + + def min(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_min((), dict(out=out, keepdims=keepdims)) + return nanops.nanmin(self._ndarray, axis=axis, skipna=skipna) + + def max(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_max((), dict(out=out, keepdims=keepdims)) + return nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) + + def sum(self, axis=None, dtype=None, out=None, keepdims=False, + initial=None, skipna=True, min_count=0): + nv.validate_sum((), dict(dtype=dtype, out=out, keepdims=keepdims, + initial=initial)) + return nanops.nansum(self._ndarray, axis=axis, skipna=skipna, + min_count=min_count) + + def prod(self, axis=None, dtype=None, out=None, keepdims=False, + initial=None, skipna=True, min_count=0): + nv.validate_prod((), dict(dtype=dtype, out=out, keepdims=keepdims, + initial=initial)) + return nanops.nanprod(self._ndarray, axis=axis, skipna=skipna, + min_count=min_count) + + def mean(self, axis=None, dtype=None, out=None, keepdims=False, + skipna=True): + nv.validate_mean((), dict(dtype=dtype, out=out, keepdims=keepdims)) + return nanops.nanmean(self._ndarray, axis=axis, skipna=skipna) + + def median(self, axis=None, out=None, overwrite_input=False, + keepdims=False, skipna=True): + 
nv.validate_median((), dict(out=out, overwrite_input=overwrite_input, + keepdims=keepdims)) + return nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) + + def std(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, + skipna=True): + nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, + keepdims=keepdims), + fname='std') + return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, + ddof=ddof) + + def var(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, + skipna=True): + nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, + keepdims=keepdims), + fname='var') + return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, + ddof=ddof) + + def sem(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, + skipna=True): + nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, + keepdims=keepdims), + fname='sem') + return nanops.nansem(self._ndarray, axis=axis, skipna=skipna, + ddof=ddof) + + def kurt(self, axis=None, dtype=None, out=None, keepdims=False, + skipna=True): + nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, + keepdims=keepdims), + fname='kurt') + return nanops.nankurt(self._ndarray, axis=axis, skipna=skipna) + + def skew(self, axis=None, dtype=None, out=None, keepdims=False, + skipna=True): + nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, + keepdims=keepdims), + fname='skew') + return nanops.nanskew(self._ndarray, axis=axis, skipna=skipna) + + # ------------------------------------------------------------------------ + # Additional Methods + def to_numpy(self, dtype=None, copy=False): + """ + Convert the PandasArray to a :class:`numpy.ndarray`. + + By default, this requires no coercion or copying of data. + + Parameters + ---------- + dtype : numpy.dtype + The NumPy dtype to pass to :func:`numpy.asarray`. + copy : bool, default False + Whether to copy the underlying data. 
+ + Returns + ------- + ndarray + """ + result = np.asarray(self._ndarray, dtype=dtype) + if copy and result is self._ndarray: + result = result.copy() + + return result + + # ------------------------------------------------------------------------ + # Ops + + def __invert__(self): + return type(self)(~self._ndarray) + + @classmethod + def _create_arithmetic_method(cls, op): + def arithmetic_method(self, other): + if isinstance(other, (ABCIndexClass, ABCSeries)): + return NotImplemented + + elif isinstance(other, cls): + other = other._ndarray + + with np.errstate(all="ignore"): + result = op(self._ndarray, other) + + if op is divmod: + a, b = result + return cls(a), cls(b) + + return cls(result) + + return compat.set_function_name(arithmetic_method, + "__{}__".format(op.__name__), + cls) + + _create_comparison_method = _create_arithmetic_method + + +PandasArray._add_arithmetic_ops() +PandasArray._add_comparison_ops() diff --git a/pandas/core/base.py b/pandas/core/base.py index 46f61c353056e..f24c235fa65ae 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -784,18 +784,21 @@ def base(self): @property def array(self): - # type: () -> Union[np.ndarray, ExtensionArray] + # type: () -> ExtensionArray """ - The actual Array backing this Series or Index. + The ExtensionArray of the data backing this Series or Index. .. versionadded:: 0.24.0 Returns ------- - array : numpy.ndarray or ExtensionArray - This is the actual array stored within this object. This differs - from ``.values`` which may require converting the data - to a different form. + array : ExtensionArray + An ExtensionArray of the values stored within. For extension + types, this is the actual array. For NumPy native types, this + is a thin (no copy) wrapper around :class:`numpy.ndarray`. + + ``.array`` differs from ``.values`` which may require converting + the data to a different form. 
See Also -------- @@ -820,26 +823,39 @@ def array(self): For any 3rd-party extension types, the array type will be an ExtensionArray. - For all remaining dtypes ``.array`` will be the :class:`numpy.ndarray` + For all remaining dtypes ``.array`` will be a + :class:`arrays.PandasArray` wrapping the actual ndarray stored within. If you absolutely need a NumPy array (possibly with copying / coercing data), then use :meth:`Series.to_numpy` instead. - .. note:: - - ``.array`` will always return the underlying object backing the - Series or Index. If a future version of pandas adds a specialized - extension type for a data type, then the return type of ``.array`` - for that data type will change from an object-dtype ndarray to the - new ExtensionArray. - Examples -------- + + For regular NumPy types like int and float, a PandasArray + is returned. + + >>> pd.Series([1, 2, 3]).array + <PandasArray> + [1, 2, 3] + Length: 3, dtype: int64 + + For extension types, like Categorical, the actual ExtensionArray + is returned + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.array [a, b, a] Categories (2, object): [a, b] """ - return self._values + result = self._values + + # TODO(DatetimeArray): remove the second clause. 
+ if (not is_extension_array_dtype(result.dtype) + and not is_datetime64tz_dtype(result.dtype)): + from pandas.core.arrays.numpy_ import PandasArray + + result = PandasArray(result) + return result def to_numpy(self, dtype=None, copy=False): """ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index e1141c6b6b3a8..cc83c1e5a3399 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -14,8 +14,8 @@ registry) from pandas.core.dtypes.generic import ( ABCCategorical, ABCCategoricalIndex, ABCDateOffset, ABCDatetimeIndex, - ABCIndexClass, ABCPeriodArray, ABCPeriodIndex, ABCSeries, ABCSparseArray, - ABCSparseSeries) + ABCIndexClass, ABCPandasArray, ABCPeriodArray, ABCPeriodIndex, ABCSeries, + ABCSparseArray, ABCSparseSeries) from pandas.core.dtypes.inference import ( # noqa:F401 is_array_like, is_bool, is_complex, is_decimal, is_dict_like, is_file_like, is_float, is_hashable, is_integer, is_interval, is_iterator, is_list_like, @@ -2016,3 +2016,54 @@ def pandas_dtype(dtype): raise TypeError("dtype '{}' not understood".format(dtype)) return npdtype + + +def extract_array(obj, extract_numpy=False): + """ + Extract the ndarray or ExtensionArray from a Series or Index. + + For all other types, `obj` is just returned as is. + + Parameters + ---------- + obj : object + For Series / Index, the underlying ExtensionArray is unboxed. + For Numpy-backed ExtensionArrays, the ndarray is extracted. + + extract_numpy : bool, default False + Whether to extract the ndarray from a PandasArray + + Returns + ------- + arr : object + + Examples + -------- + >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category')) + [a, b, c] + Categories (3, object): [a, b, c] + + Other objects like lists, arrays, and DataFrames are just passed through. + + >>> extract_array([1, 2, 3]) + [1, 2, 3] + + For an ndarray-backed Series / Index a PandasArray is returned. 
+ + >>> extract_array(pd.Series([1, 2, 3])) + + [1, 2, 3] + Length: 3, dtype: int64 + + To extract all the way down to the ndarray, pass ``extract_numpy=True``. + + >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True) + array([1, 2, 3]) + """ + if isinstance(obj, (ABCIndexClass, ABCSeries)): + obj = obj.array + + if extract_numpy and isinstance(obj, ABCPandasArray): + obj = obj._ndarray + + return obj diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 7a3ff5d295421..bbc447d6fa0da 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -67,7 +67,11 @@ def _check(cls, inst): ("extension", "categorical", "periodarray", + "npy_extension", )) +ABCPandasArray = create_pandas_abc_type("ABCPandasArray", + "_typ", + ("npy_extension",)) class _ABCGeneric(type): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cc6f182fadce6..6f121ba264f75 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -26,8 +26,8 @@ import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDateOffset, ABCDatetimeIndex, ABCIndexClass, - ABCMultiIndex, ABCPeriodIndex, ABCSeries, ABCTimedeltaArray, - ABCTimedeltaIndex) + ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, ABCSeries, + ABCTimedeltaArray, ABCTimedeltaIndex) from pandas.core.dtypes.missing import array_equivalent, isna from pandas.core import ops @@ -264,6 +264,9 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, return cls._simple_new(data, name) from .range import RangeIndex + if isinstance(data, ABCPandasArray): + # ensure users don't accidentally put a PandasArray in an index. 
+ data = data._ndarray # range if isinstance(data, RangeIndex): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1383ce09bc2d0..6d27946476026 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -18,12 +18,13 @@ infer_dtype_from_scalar, maybe_convert_objects, maybe_downcast_to_dtype, maybe_infer_dtype_type, maybe_promote, maybe_upcast, soft_convert_objects) from pandas.core.dtypes.common import ( - _NS_DTYPE, _TD_DTYPE, ensure_platform_int, is_bool_dtype, is_categorical, - is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_dtype_equal, is_extension_array_dtype, is_extension_type, - is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, - is_list_like, is_numeric_v_string_like, is_object_dtype, is_period_dtype, - is_re, is_re_compilable, is_sparse, is_timedelta64_dtype, pandas_dtype) + _NS_DTYPE, _TD_DTYPE, ensure_platform_int, extract_array, is_bool_dtype, + is_categorical, is_categorical_dtype, is_datetime64_dtype, + is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, + is_extension_type, is_float_dtype, is_integer, is_integer_dtype, + is_interval_dtype, is_list_like, is_numeric_v_string_like, is_object_dtype, + is_period_dtype, is_re, is_re_compilable, is_sparse, is_timedelta64_dtype, + pandas_dtype) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, ExtensionDtype, PandasExtensionDtype) @@ -1969,22 +1970,19 @@ def shift(self, periods, axis=0): def where(self, other, cond, align=True, errors='raise', try_cast=False, axis=0, transpose=False): - # Extract the underlying arrays. - if isinstance(other, (ABCIndexClass, ABCSeries)): - other = other.array - - elif isinstance(other, ABCDataFrame): + if isinstance(other, ABCDataFrame): # ExtensionArrays are 1-D, so if we get here then # `other` should be a DataFrame with a single column. 
+ assert other.shape[1] == 1 - other = other.iloc[:, 0].array + other = other.iloc[:, 0] + + other = extract_array(other, extract_numpy=True) if isinstance(cond, ABCDataFrame): assert cond.shape[1] == 1 - cond = cond.iloc[:, 0].array + cond = cond.iloc[:, 0] - elif isinstance(cond, (ABCIndexClass, ABCSeries)): - cond = cond.array + cond = extract_array(cond, extract_numpy=True) if lib.is_scalar(other) and isna(other): # The default `other` for Series / Frame is np.nan diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c437456794f43..d5b1b0c93fed1 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -23,8 +23,8 @@ is_extension_array_dtype, is_extension_type, is_float_dtype, is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, ABCSeries, - ABCTimedeltaIndex) + ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPandasArray, + ABCPeriodIndex, ABCSeries, ABCTimedeltaIndex) from pandas.core.dtypes.missing import isna from pandas.core import algorithms, common as com @@ -578,7 +578,16 @@ def sanitize_array(data, index, dtype=None, copy=False, subarr = _try_cast(data, True, dtype, copy, raise_cast_failure) elif isinstance(data, ExtensionArray): - subarr = data + if isinstance(data, ABCPandasArray): + # We don't want to let people put our PandasArray wrapper + # (the output of Series/Index.array), into a Series. So + # we explicitly unwrap it here. + subarr = data._ndarray + else: + subarr = data + + # everything else in this block must also handle ndarrays, + # because we've unwrapped PandasArray into an ndarray. 
if dtype is not None and not data.dtype.is_dtype(dtype): subarr = data.astype(dtype) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 8319a8cc5417c..449d0b3d39e24 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -11,8 +11,9 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - ensure_platform_int, is_bool_dtype, is_extension_array_dtype, is_list_like, - is_object_dtype, needs_i8_conversion) + ensure_platform_int, extract_array, is_bool_dtype, + is_extension_array_dtype, is_list_like, is_object_dtype, + needs_i8_conversion) from pandas.core.dtypes.missing import notna from pandas import compat @@ -432,7 +433,7 @@ def _unstack_extension_series(series, level, fill_value): level=level, fill_value=-1).get_result() out = [] - values = series.array + values = extract_array(series, extract_numpy=False) for col, indices in result.iteritems(): out.append(Series(values.take(indices.values, diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index da894a0881400..b733ce806981a 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -28,7 +28,7 @@ class TestPDApi(Base): ignored = ['tests', 'locale', 'conftest'] # top-level sub-packages - lib = ['api', 'compat', 'core', 'errors', 'pandas', + lib = ['api', 'arrays', 'compat', 'core', 'errors', 'pandas', 'plotting', 'test', 'testing', 'tseries', 'util', 'options', 'io'] diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py new file mode 100644 index 0000000000000..34ac325fef859 --- /dev/null +++ b/pandas/tests/arrays/test_numpy.py @@ -0,0 +1,93 @@ +""" +Additional tests for PandasArray that aren't covered by +the interface tests. 
+""" +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas.core.arrays import PandasArray +import pandas.util.testing as tm + + +def test_to_numpy(): + arr = PandasArray(np.array([1, 2, 3])) + result = arr.to_numpy() + assert result is arr._ndarray + + result = arr.to_numpy(copy=True) + assert result is not arr._ndarray + + result = arr.to_numpy(dtype='f8') + expected = np.array([1, 2, 3], dtype='f8') + tm.assert_numpy_array_equal(result, expected) + + +def test_setitem(): + ser = pd.Series([1, 2, 3]) + ser.array[0] = 10 + expected = pd.Series([10, 2, 3]) + tm.assert_series_equal(ser, expected) + + +def test_bad_reduce_raises(): + arr = np.array([1, 2, 3], dtype='int64') + arr = PandasArray(arr) + msg = "cannot perform not_a_method with type int" + with pytest.raises(TypeError, match=msg): + arr._reduce(msg) + + +def test_from_sequence_dtype(): + arr = np.array([1, 2, 3], dtype='int64') + result = PandasArray._from_sequence(arr, dtype='uint64') + expected = PandasArray(np.array([1, 2, 3], dtype='uint64')) + tm.assert_extension_array_equal(result, expected) + + +def test_validate_reduction_keyword_args(): + arr = PandasArray(np.array([1, 2, 3])) + msg = "the 'keepdims' parameter is not supported .*all" + with pytest.raises(ValueError, match=msg): + arr.all(keepdims=True) + + +@td.skip_if_no("numpy", min_version="1.13.0") +def test_ufunc(): + arr = PandasArray([-1.0, 0.0, 1.0]) + result = np.abs(arr) + expected = PandasArray(np.abs(arr._ndarray)) + tm.assert_extension_array_equal(result, expected) + + r1, r2 = np.divmod(arr, np.add(arr, 2)) + e1, e2 = np.divmod(arr._ndarray, np.add(arr._ndarray, 2)) + e1 = PandasArray(e1) + e2 = PandasArray(e2) + tm.assert_extension_array_equal(r1, e1) + tm.assert_extension_array_equal(r2, e2) + + +@td.skip_if_no("numpy", min_version="1.13.0") +def test_basic_binop(): + # Just a basic smoke test. The EA interface tests exercise this + # more thoroughly. 
+ x = PandasArray([1, 2, 3]) + result = x + x + expected = PandasArray([2, 4, 6]) + tm.assert_extension_array_equal(result, expected) + + +def test_series_constructor_with_copy(): + ndarray = np.array([1, 2, 3]) + ser = pd.Series(PandasArray(ndarray), copy=True) + + assert ser.values is not ndarray + + +def test_series_constructor_with_astype(): + ndarray = np.array([1, 2, 3]) + result = pd.Series(PandasArray(ndarray), dtype="float64") + expected = pd.Series([1.0, 2.0, 3.0], dtype="float64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py new file mode 100644 index 0000000000000..70a3a8ab58aac --- /dev/null +++ b/pandas/tests/extension/test_numpy.py @@ -0,0 +1,212 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import compat +from pandas.core.arrays.numpy_ import PandasArray, PandasDtype +import pandas.util.testing as tm + +from . import base + + +@pytest.fixture +def dtype(): + return PandasDtype(np.dtype('float')) + + +@pytest.fixture +def allow_in_pandas(monkeypatch): + """ + A monkeypatch that tells pandas to let us in. + + By default, passing a PandasArray to an index / series / frame + constructor will unbox that PandasArray to an ndarray, and treat + it as a non-EA column. We don't want people using EAs without + reason. + + The mechanism for this is a check against ABCPandasArray + in each constructor. + + But, for testing, we need to allow them in pandas. So we patch + the _typ of PandasArray, so that we evade the ABCPandasArray + check. 
+ """ + with monkeypatch.context() as m: + m.setattr(PandasArray, '_typ', 'extension') + yield + + +@pytest.fixture +def data(allow_in_pandas, dtype): + return PandasArray(np.arange(1, 101, dtype=dtype._dtype)) + + +@pytest.fixture +def data_missing(allow_in_pandas): + return PandasArray(np.array([np.nan, 1.0])) + + +@pytest.fixture +def na_value(): + return np.nan + + +@pytest.fixture +def na_cmp(): + def cmp(a, b): + return np.isnan(a) and np.isnan(b) + return cmp + + +@pytest.fixture +def data_for_sorting(allow_in_pandas): + """Length-3 array with a known sort order. + + This should be three items [B, C, A] with + A < B < C + """ + return PandasArray( + np.array([1, 2, 0]) + ) + + +@pytest.fixture +def data_missing_for_sorting(allow_in_pandas): + """Length-3 array with a known sort order. + + This should be three items [B, NA, A] with + A < B and NA missing. + """ + return PandasArray( + np.array([1, np.nan, 0]) + ) + + +@pytest.fixture +def data_for_grouping(allow_in_pandas): + """Data for factorization, grouping, and unique tests. + + Expected to be like [B, B, NA, NA, A, A, B, C] + + Where A < B < C and NA is missing + """ + a, b, c = np.arange(3) + return PandasArray(np.array( + [b, b, np.nan, np.nan, a, a, b, c] + )) + + +class BaseNumPyTests(object): + pass + + +class TestCasting(BaseNumPyTests, base.BaseCastingTests): + pass + + +class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests): + @pytest.mark.skip(reason="We don't register our dtype") + # We don't want to register. This test should probably be split in two. + def test_from_dtype(self, data): + pass + + +class TestDtype(BaseNumPyTests, base.BaseDtypeTests): + + @pytest.mark.skip(reason="Incorrect expected.") + # we unsurprisingly clash with a NumPy name. 
+ def test_check_dtype(self, data): + pass + + +class TestGetitem(BaseNumPyTests, base.BaseGetitemTests): + pass + + +class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): + pass + + +class TestInterface(BaseNumPyTests, base.BaseInterfaceTests): + pass + + +class TestMethods(BaseNumPyTests, base.BaseMethodsTests): + + @pytest.mark.skip(reason="TODO: remove?") + def test_value_counts(self, all_data, dropna): + pass + + @pytest.mark.skip(reason="Incorrect expected") + # We have a bool dtype, so the result is an ExtensionArray + # but expected is not + def test_combine_le(self, data_repeated): + super(TestMethods, self).test_combine_le(data_repeated) + + +class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests): + divmod_exc = None + series_scalar_exc = None + frame_scalar_exc = None + series_array_exc = None + + def test_divmod_series_array(self, data): + s = pd.Series(data) + self._check_divmod_op(s, divmod, data, exc=None) + + @pytest.mark.skip("We implement ops") + def test_error(self, data, all_arithmetic_operators): + pass + + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + if (compat.PY2 and + all_arithmetic_operators in {'__div__', '__rdiv__'}): + raise pytest.skip( + "Matching NumPy int / int -> float behavior." + ) + super(TestArithmetics, self).test_arith_series_with_scalar( + data, all_arithmetic_operators + ) + + def test_arith_series_with_array(self, data, all_arithmetic_operators): + if (compat.PY2 and + all_arithmetic_operators in {'__div__', '__rdiv__'}): + raise pytest.skip( + "Matching NumPy int / int -> float behavior." + ) + super(TestArithmetics, self).test_arith_series_with_array( + data, all_arithmetic_operators + ) + + +class TestPrinting(BaseNumPyTests, base.BasePrintingTests): + pass + + +class TestNumericReduce(BaseNumPyTests, base.BaseNumericReduceTests): + + def check_reduce(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + # avoid coercing int -> float. 
Just cast to the actual numpy type. + expected = getattr(s.astype(s.dtype._dtype), op_name)(skipna=skipna) + tm.assert_almost_equal(result, expected) + + +class TestBooleanReduce(BaseNumPyTests, base.BaseBooleanReduceTests): + pass + + +class TestMising(BaseNumPyTests, base.BaseMissingTests): + pass + + +class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): + + @pytest.mark.skip("Incorrect parent test") + # not actually a mixed concat, since we concat int and int. + def test_concat_mixed_dtypes(self, data): + super(TestReshaping, self).test_concat_mixed_dtypes(data) + + +class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): + pass diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index fa1117a647850..f8c09ff8614bb 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -23,6 +23,7 @@ import pandas as pd import pandas.util.testing as tm from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike +from pandas.core.internals.blocks import IntBlock from pandas.tests.frame.common import TestData @@ -2165,6 +2166,15 @@ def test_constructor_range_dtype(self, dtype): result = DataFrame({'A': range(5)}, dtype=dtype) tm.assert_frame_equal(result, expected) + def test_constructor_no_numpy_backed_ea(self): + # Ensure that PandasArray isn't allowed inside Series + # See https://github.com/pandas-dev/pandas/issues/23995 for more. 
+ arr = pd.Series([1, 2, 3]).array + result = pd.DataFrame({"A": arr}) + expected = pd.DataFrame({"A": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + assert isinstance(result._data.blocks[0], IntBlock) + def test_frame_from_list_subclass(self): # GH21226 class List(list): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2580a47e8fdd3..7c52a8a3e5d32 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -260,6 +260,12 @@ def test_constructor_int_dtype_nan_raises(self, dtype): with pytest.raises(ValueError, match=msg): Index(data, dtype=dtype) + def test_constructor_no_numpy_backed_ea(self): + ser = pd.Series([1, 2, 3]) + result = pd.Index(ser.array) + expected = pd.Index([1, 2, 3]) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("klass,dtype,na_val", [ (pd.Float64Index, np.float64, np.nan), (pd.DatetimeIndex, 'datetime64[ns]', pd.NaT) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index f5a445e2cca9a..b29c86d5ea02d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -22,6 +22,7 @@ Timestamp, date_range, isna, period_range, timedelta_range) from pandas.api.types import CategoricalDtype from pandas.core.arrays import period_array +from pandas.core.internals.blocks import IntBlock import pandas.util.testing as tm from pandas.util.testing import assert_series_equal @@ -1238,3 +1239,9 @@ def test_constructor_tz_mixed_data(self): result = Series(dt_list) expected = Series(dt_list, dtype=object) tm.assert_series_equal(result, expected) + + def test_constructor_no_numpy_backed_ea(self): + ser = pd.Series([1, 2, 3]) + result = pd.Series(ser.array) + tm.assert_series_equal(ser, result) + assert isinstance(result._data.blocks[0], IntBlock) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 6eada0e89b506..448c2b2222e19 100644 --- 
a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -18,6 +18,7 @@ CategoricalIndex, Timestamp) from pandas.compat import StringIO, PYPY, long from pandas.compat.numpy import np_array_datetime64_compat +from pandas.core.arrays import PandasArray from pandas.core.accessor import PandasDelegate from pandas.core.base import PandasObject, NoNewAttributesMixin from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin @@ -1252,8 +1253,24 @@ def test_ndarray_values(array, expected): tm.assert_numpy_array_equal(l_values, expected) +@pytest.mark.parametrize("arr", [ + np.array([1, 2, 3]), + np.array([1, 2, 3], dtype="datetime64[ns]"), +]) +def test_numpy_array(arr): + ser = pd.Series(arr) + result = ser.array + expected = PandasArray(arr) + tm.assert_extension_array_equal(result, expected) + + +def test_numpy_array_all_dtypes(any_numpy_dtype): + ser = pd.Series(dtype=any_numpy_dtype) + result = ser.array + assert isinstance(result, PandasArray) + + @pytest.mark.parametrize("array, attr", [ - (np.array([1, 2], dtype=np.int64), None), (pd.Categorical(['a', 'b']), '_codes'), (pd.core.arrays.period_array(['2000', '2001'], freq='D'), '_data'), (pd.core.arrays.integer_array([0, np.nan]), '_data'), From 2c615b07e2ecc45a74157820fb38947b0cf2933c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 26 Dec 2018 11:28:47 -0600 Subject: [PATCH 03/11] move tests --- pandas/tests/frame/test_block_internals.py | 10 ++++++++++ pandas/tests/frame/test_constructors.py | 10 ---------- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/series/test_constructors.py | 7 ------- pandas/tests/series/test_internals.py | 7 +++++++ 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 647077a0428f3..2817a46058762 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -12,6 +12,7 @@ from pandas import (DataFrame, Series, 
Timestamp, date_range, compat, option_context, Categorical) +from pandas.core.internals.blocks import IntBlock from pandas.core.arrays import IntervalArray, integer_array from pandas.compat import StringIO import pandas as pd @@ -579,3 +580,12 @@ def test_strange_column_corruption_issue(self): first = len(df.loc[pd.isna(df[myid]), [myid]]) second = len(df.loc[pd.isna(df[myid]), [myid]]) assert first == second == 0 + + def test_constructor_no_pandas_array(self): + # Ensure that PandasArray isn't allowed inside Series + # See https://github.com/pandas-dev/pandas/issues/23995 for more. + arr = pd.Series([1, 2, 3]).array + result = pd.DataFrame({"A": arr}) + expected = pd.DataFrame({"A": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + assert isinstance(result._data.blocks[0], IntBlock) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f8c09ff8614bb..fa1117a647850 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -23,7 +23,6 @@ import pandas as pd import pandas.util.testing as tm from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike -from pandas.core.internals.blocks import IntBlock from pandas.tests.frame.common import TestData @@ -2166,15 +2165,6 @@ def test_constructor_range_dtype(self, dtype): result = DataFrame({'A': range(5)}, dtype=dtype) tm.assert_frame_equal(result, expected) - def test_constructor_no_numpy_backed_ea(self): - # Ensure that PandasArray isn't allowed inside Series - # See https://github.com/pandas-dev/pandas/issues/23995 for more. 
- arr = pd.Series([1, 2, 3]).array - result = pd.DataFrame({"A": arr}) - expected = pd.DataFrame({"A": [1, 2, 3]}) - tm.assert_frame_equal(result, expected) - assert isinstance(result._data.blocks[0], IntBlock) - def test_frame_from_list_subclass(self): # GH21226 class List(list): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index e5a9805eb177c..5b6ae99db083b 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -258,7 +258,7 @@ def test_constructor_int_dtype_nan_raises(self, dtype): with pytest.raises(ValueError, match=msg): Index(data, dtype=dtype) - def test_constructor_no_numpy_backed_ea(self): + def test_constructor_no_pandas_array(self): ser = pd.Series([1, 2, 3]) result = pd.Index(ser.array) expected = pd.Index([1, 2, 3]) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index b29c86d5ea02d..f5a445e2cca9a 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -22,7 +22,6 @@ Timestamp, date_range, isna, period_range, timedelta_range) from pandas.api.types import CategoricalDtype from pandas.core.arrays import period_array -from pandas.core.internals.blocks import IntBlock import pandas.util.testing as tm from pandas.util.testing import assert_series_equal @@ -1239,9 +1238,3 @@ def test_constructor_tz_mixed_data(self): result = Series(dt_list) expected = Series(dt_list, dtype=object) tm.assert_series_equal(result, expected) - - def test_constructor_no_numpy_backed_ea(self): - ser = pd.Series([1, 2, 3]) - result = pd.Series(ser.array) - tm.assert_series_equal(ser, result) - assert isinstance(result._data.blocks[0], IntBlock) diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index 21094c0079d41..31cbea8f95090 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -8,6 +8,7 @@ import pandas as pd from pandas 
import NaT, Series, Timestamp +from pandas.core.internals.blocks import IntBlock import pandas.util.testing as tm from pandas.util.testing import assert_series_equal @@ -306,6 +307,12 @@ def test_convert_preserve_all_bool(self): e = Series([False, True, False, False], dtype=bool) tm.assert_series_equal(r, e) + def test_constructor_no_pandas_array(self): + ser = pd.Series([1, 2, 3]) + result = pd.Series(ser.array) + tm.assert_series_equal(ser, result) + assert isinstance(result._data.blocks[0], IntBlock) + def test_hasnans_unchached_for_series(): # GH#19700 From 7efafa6e99ed26e9bd3ec4a1bbee9f1a82d00537 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 27 Dec 2018 14:36:19 -0600 Subject: [PATCH 04/11] use to_numpy --- pandas/core/indexes/base.py | 2 +- pandas/core/internals/construction.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dddfbd74cbeca..71fb8c7179109 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -266,7 +266,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, from .range import RangeIndex if isinstance(data, ABCPandasArray): # ensure users don't accidentally put a PandasArray in an index. - data = data._ndarray + data = data.to_numpy() # range if isinstance(data, RangeIndex): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index d5b1b0c93fed1..b18b966406bbb 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -582,7 +582,7 @@ def sanitize_array(data, index, dtype=None, copy=False, # We don't want to let people put our PandasArray wrapper # (the output of Series/Index.array), into a Series. So # we explicitly unwrap it here. 
- subarr = data._ndarray + subarr = data.to_numpy() else: subarr = data From bc50dc057eb0906ac8be9b0d8a597fb5811f5d53 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 27 Dec 2018 14:41:26 -0600 Subject: [PATCH 05/11] Move to internals --- pandas/core/arrays/categorical.py | 11 ++++--- pandas/core/arrays/numpy_.py | 3 +- pandas/core/dtypes/common.py | 55 ++----------------------------- pandas/core/internals/arrays.py | 55 +++++++++++++++++++++++++++++++ pandas/core/internals/blocks.py | 14 ++++---- pandas/core/reshape/reshape.py | 6 ++-- 6 files changed, 75 insertions(+), 69 deletions(-) create mode 100644 pandas/core/internals/arrays.py diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d3e2e0b9b02eb..a47406cded7b4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -16,11 +16,11 @@ from pandas.core.dtypes.cast import ( coerce_indexer_dtype, maybe_infer_to_datetimelike) from pandas.core.dtypes.common import ( - ensure_int64, ensure_object, ensure_platform_int, extract_array, - is_categorical, is_categorical_dtype, is_datetime64_dtype, is_datetimelike, - is_dict_like, is_dtype_equal, is_extension_array_dtype, is_float_dtype, - is_integer_dtype, is_iterator, is_list_like, is_object_dtype, is_scalar, - is_sequence, is_timedelta64_dtype) + ensure_int64, ensure_object, ensure_platform_int, is_categorical, + is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like, + is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype, + is_iterator, is_list_like, is_object_dtype, is_scalar, is_sequence, + is_timedelta64_dtype) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ( ABCCategoricalIndex, ABCIndexClass, ABCSeries) @@ -2091,6 +2091,7 @@ def __setitem__(self, key, value): If (one or more) Value is not in categories or if a assigned `Categorical` does not have the same categories """ + from 
pandas.core.internals.arrays import extract_array value = extract_array(value, extract_numpy=True) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 0eef5263ffee7..981d971acf038 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -5,7 +5,6 @@ from pandas._libs import lib from pandas.compat.numpy import function as nv -from pandas.core.dtypes.common import extract_array from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_list_like @@ -210,6 +209,8 @@ def __getitem__(self, item): return result def __setitem__(self, key, value): + from pandas.core.internals.arrays import extract_array + value = extract_array(value, extract_numpy=True) if not lib.is_scalar(key) and is_list_like(key): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index cc83c1e5a3399..e1141c6b6b3a8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -14,8 +14,8 @@ registry) from pandas.core.dtypes.generic import ( ABCCategorical, ABCCategoricalIndex, ABCDateOffset, ABCDatetimeIndex, - ABCIndexClass, ABCPandasArray, ABCPeriodArray, ABCPeriodIndex, ABCSeries, - ABCSparseArray, ABCSparseSeries) + ABCIndexClass, ABCPeriodArray, ABCPeriodIndex, ABCSeries, ABCSparseArray, + ABCSparseSeries) from pandas.core.dtypes.inference import ( # noqa:F401 is_array_like, is_bool, is_complex, is_decimal, is_dict_like, is_file_like, is_float, is_hashable, is_integer, is_interval, is_iterator, is_list_like, @@ -2016,54 +2016,3 @@ def pandas_dtype(dtype): raise TypeError("dtype '{}' not understood".format(dtype)) return npdtype - - -def extract_array(obj, extract_numpy=False): - """ - Extract the ndarray or ExtensionArray from a Series or Index. - - For all other types, `obj` is just returned as is. - - Parameters - ---------- - obj : object - For Series / Index, the underlying ExtensionArray is unboxed. 
- For Numpy-backed ExtensionArrays, the ndarray is extracted. - - extract_numpy : bool, default False - Whether to extract the ndarray from a PandasArray - - Returns - ------- - arr : object - - Examples - -------- - >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category')) - [a, b, c] - Categories (3, object): [a, b, c] - - Other objects like lists, arrays, and DataFrames are just passed through. - - >>> extract_array([1, 2, 3]) - [1, 2, 3] - - For an ndarray-backed Series / Index a PandasArray is returned. - - >>> extract_array(pd.Series([1, 2, 3])) - - [1, 2, 3] - Length: 3, dtype: int64 - - To extract all the way down to the ndarray, pass ``extract_numpy=True``. - - >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True) - array([1, 2, 3]) - """ - if isinstance(obj, (ABCIndexClass, ABCSeries)): - obj = obj.array - - if extract_numpy and isinstance(obj, ABCPandasArray): - obj = obj._ndarray - - return obj diff --git a/pandas/core/internals/arrays.py b/pandas/core/internals/arrays.py new file mode 100644 index 0000000000000..18af328bfa77f --- /dev/null +++ b/pandas/core/internals/arrays.py @@ -0,0 +1,55 @@ +""" +Methods for cleaning, validating, and unboxing arrays. +""" +from pandas.core.dtypes.generic import ABCIndexClass, ABCPandasArray, ABCSeries + + +def extract_array(obj, extract_numpy=False): + """ + Extract the ndarray or ExtensionArray from a Series or Index. + + For all other types, `obj` is just returned as is. + + Parameters + ---------- + obj : object + For Series / Index, the underlying ExtensionArray is unboxed. + For Numpy-backed ExtensionArrays, the ndarray is extracted. + + extract_numpy : bool, default False + Whether to extract the ndarray from a PandasArray + + Returns + ------- + arr : object + + Examples + -------- + >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category')) + [a, b, c] + Categories (3, object): [a, b, c] + + Other objects like lists, arrays, and DataFrames are just passed through. 
+ + >>> extract_array([1, 2, 3]) + [1, 2, 3] + + For an ndarray-backed Series / Index a PandasArray is returned. + + >>> extract_array(pd.Series([1, 2, 3])) + + [1, 2, 3] + Length: 3, dtype: int64 + + To extract all the way down to the ndarray, pass ``extract_numpy=True``. + + >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True) + array([1, 2, 3]) + """ + if isinstance(obj, (ABCIndexClass, ABCSeries)): + obj = obj.array + + if extract_numpy and isinstance(obj, ABCPandasArray): + obj = obj.to_numpy() + + return obj diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 011013b07a7f2..375b4ccbc122f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -18,13 +18,12 @@ infer_dtype_from_scalar, maybe_convert_objects, maybe_downcast_to_dtype, maybe_infer_dtype_type, maybe_promote, maybe_upcast, soft_convert_objects) from pandas.core.dtypes.common import ( - _NS_DTYPE, _TD_DTYPE, ensure_platform_int, extract_array, is_bool_dtype, - is_categorical, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, - is_extension_type, is_float_dtype, is_integer, is_integer_dtype, - is_interval_dtype, is_list_like, is_numeric_v_string_like, is_object_dtype, - is_period_dtype, is_re, is_re_compilable, is_sparse, is_timedelta64_dtype, - pandas_dtype) + _NS_DTYPE, _TD_DTYPE, ensure_platform_int, is_bool_dtype, is_categorical, + is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_dtype_equal, is_extension_array_dtype, is_extension_type, + is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, + is_list_like, is_numeric_v_string_like, is_object_dtype, is_period_dtype, + is_re, is_re_compilable, is_sparse, is_timedelta64_dtype, pandas_dtype) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, ExtensionDtype, PandasExtensionDtype) @@ -41,6 +40,7 @@ from 
pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_setitem_lengths +from pandas.core.internals.arrays import extract_array import pandas.core.missing as missing from pandas.io.formats.printing import pprint_thing diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 31afdc814d082..70161826696c5 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -11,9 +11,8 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - ensure_platform_int, extract_array, is_bool_dtype, - is_extension_array_dtype, is_integer_dtype, is_list_like, is_object_dtype, - needs_i8_conversion) + ensure_platform_int, is_bool_dtype, is_extension_array_dtype, + is_integer_dtype, is_list_like, is_object_dtype, needs_i8_conversion) from pandas.core.dtypes.missing import notna from pandas import compat @@ -22,6 +21,7 @@ from pandas.core.arrays.categorical import _factorize_from_iterable from pandas.core.frame import DataFrame from pandas.core.index import Index, MultiIndex +from pandas.core.internals.arrays import extract_array from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, decons_obs_group_ids, get_compressed_ids, From d38fd19a6ec4298bddecd511c8ae2122cd84188c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 27 Dec 2018 14:47:20 -0600 Subject: [PATCH 06/11] Remove check --- pandas/core/arrays/numpy_.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 981d971acf038..e3bb92979b0bd 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -215,9 +215,6 @@ def __setitem__(self, key, value): if not lib.is_scalar(key) and is_list_like(key): key = np.asarray(key) - if not len(key): - # early return to avoid casting unnecessarily. 
- return if not lib.is_scalar(value): value = np.asarray(value) From bd70a1953f14b883534411330b5cad7a4d27af2d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 28 Dec 2018 07:23:32 -0600 Subject: [PATCH 07/11] fixup --- pandas/core/arrays/numpy_.py | 12 ++++++------ pandas/tests/arrays/test_numpy.py | 11 ++++++++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index e3bb92979b0bd..758a30243b9f3 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -4,13 +4,15 @@ from pandas._libs import lib from pandas.compat.numpy import function as nv +from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries -from pandas.core.dtypes.inference import is_list_like +from pandas.core.dtypes.inference import is_array_like, is_list_like from pandas import compat from pandas.core import nanops +from pandas.core.missing import backfill_1d, pad_1d from .base import ExtensionArray, ExtensionOpsMixin @@ -114,7 +116,9 @@ class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin): def __init__(self, values): if isinstance(values, type(self)): values = values._ndarray - values = np.asarray(values) + if not isinstance(values, np.ndarray): + raise ValueError("'values' must be a NumPy array.") + if values.ndim != 1: raise ValueError("PandasArray must be 1-dimensional.") @@ -242,10 +246,6 @@ def isna(self): return isna(self._ndarray) def fillna(self, value=None, method=None, limit=None): - from pandas.api.types import is_array_like - from pandas.util._validators import validate_fillna_kwargs - from pandas.core.missing import pad_1d, backfill_1d - # TODO(_values_for_fillna): remove this value, method = validate_fillna_kwargs(value, method) diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index 34ac325fef859..e2f747693a390 
100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -56,7 +56,7 @@ def test_validate_reduction_keyword_args(): @td.skip_if_no("numpy", min_version="1.13.0") def test_ufunc(): - arr = PandasArray([-1.0, 0.0, 1.0]) + arr = PandasArray(np.array([-1.0, 0.0, 1.0])) result = np.abs(arr) expected = PandasArray(np.abs(arr._ndarray)) tm.assert_extension_array_equal(result, expected) @@ -73,12 +73,17 @@ def test_ufunc(): def test_basic_binop(): # Just a basic smoke test. The EA interface tests exercise this # more thoroughly. - x = PandasArray([1, 2, 3]) + x = PandasArray(np.array([1, 2, 3])) result = x + x - expected = PandasArray([2, 4, 6]) + expected = PandasArray(np.array([2, 4, 6])) tm.assert_extension_array_equal(result, expected) +def test_constructor_no_coercion(): + with pytest.raises(ValueError, match='NumPy array'): + PandasArray([1, 2, 3]) + + def test_series_constructor_with_copy(): ndarray = np.array([1, 2, 3]) ser = pd.Series(PandasArray(ndarray), copy=True) From 7e3ff59770104f15cc6f7240e76451fb1a20c6a3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 28 Dec 2018 09:04:51 -0600 Subject: [PATCH 08/11] wip --- pandas/tests/arrays/test_numpy.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index e2f747693a390..3be68a18e8b13 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -8,7 +8,8 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas.core.arrays import PandasArray +from pandas.arrays import PandasArray +from pandas.core.arrays.numpy_ import PandasDtype import pandas.util.testing as tm @@ -96,3 +97,20 @@ def test_series_constructor_with_astype(): result = pd.Series(PandasArray(ndarray), dtype="float64") expected = pd.Series([1.0, 2.0, 3.0], dtype="float64") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize('dtype, expected', 
[ + ('bool', True), + ('int', True), + ('uint', True), + ('float', True), + ('complex', True), + ('str', False), + ('bytes', False), + ('datetime64[ns]', False), + ('object', False), + ('void', False) +]) +def test_is_numeric(dtype, expected): + dtype = PandasDtype(dtype) + assert dtype._is_numeric is expected From cac2a8b68ea192618d74aedabda62cf1b3d8e0dc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 28 Dec 2018 09:45:45 -0600 Subject: [PATCH 09/11] Added constructor and setitem and dtype tests --- pandas/core/arrays/numpy_.py | 11 +++++- pandas/tests/arrays/test_numpy.py | 59 ++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 758a30243b9f3..b1dc77e65eee8 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -38,6 +38,10 @@ def __init__(self, dtype): self._name = dtype.name self._type = dtype.type + @property + def numpy_dtype(self): + return self._dtype + @property def name(self): return self._name @@ -98,6 +102,8 @@ class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin): ---------- values : ndarray The NumPy ndarray to wrap. Must be 1-dimensional. + copy : bool, default False + Whether to copy `values`. 
Notes ----- @@ -113,7 +119,7 @@ class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin): # ------------------------------------------------------------------------ # Constructors - def __init__(self, values): + def __init__(self, values, copy=False): if isinstance(values, type(self)): values = values._ndarray if not isinstance(values, np.ndarray): @@ -122,6 +128,9 @@ def __init__(self, values): if values.ndim != 1: raise ValueError("PandasArray must be 1-dimensional.") + if copy: + values = values.copy() + self._ndarray = values self._dtype = PandasDtype(values.dtype) diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index 3be68a18e8b13..8ab120ce0ccea 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -26,7 +26,7 @@ def test_to_numpy(): tm.assert_numpy_array_equal(result, expected) -def test_setitem(): +def test_setitem_series(): ser = pd.Series([1, 2, 3]) ser.array[0] = 10 expected = pd.Series([10, 2, 3]) @@ -99,6 +99,46 @@ def test_series_constructor_with_astype(): tm.assert_series_equal(result, expected) +@pytest.fixture(params=[ + np.array(['a', 'b'], dtype=object), + np.array([0, 1], dtype=float), + np.array([0, 1], dtype=int), + np.array([0, 1+2j], dtype=complex), + np.array([True, False], dtype=bool), + np.array([0, 1], dtype='datetime64[ns]'), + np.array([0, 1], dtype='timedelta64[ns]'), +]) +def any_numpy_array(request): + """Parametrized fixture for NumPy arrays with different dtypes. + + This excludes string and bytes. 
+ """ + return request.param + + +def test_constructor_copy(): + arr = np.array([0, 1]) + result = PandasArray(arr, copy=True) + + assert np.shares_memory(result._ndarray, arr) is False + + +def test_constructor_with_data(any_numpy_array): + nparr = any_numpy_array + arr = PandasArray(nparr) + assert arr.dtype.numpy_dtype == nparr.dtype + + +def test_setitem(any_numpy_array): + nparr = any_numpy_array + arr = PandasArray(nparr, copy=True) + + arr[0] = arr[1] + nparr[0] = nparr[1] + + tm.assert_numpy_array_equal(arr.to_numpy(), nparr) + + @pytest.mark.parametrize('dtype, expected', [ ('bool', True), ('int', True), @@ -114,3 +154,20 @@ def test_series_constructor_with_astype(): def test_is_numeric(dtype, expected): dtype = PandasDtype(dtype) assert dtype._is_numeric is expected + + +@pytest.mark.parametrize('dtype, expected', [ + ('bool', True), + ('int', False), + ('uint', False), + ('float', False), + ('complex', False), + ('str', False), + ('bytes', False), + ('datetime64[ns]', False), + ('object', False), + ('void', False) +]) +def test_is_boolean(dtype, expected): + dtype = PandasDtype(dtype) + assert dtype._is_boolean is expected From df02695213c2ec084af838ea3566a2d155ed711b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 28 Dec 2018 09:45:55 -0600 Subject: [PATCH 10/11] Reorganize --- pandas/tests/arrays/test_numpy.py | 219 ++++++++++++++++-------------- 1 file changed, 119 insertions(+), 100 deletions(-) diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index 8ab120ce0ccea..72b6bdef27964 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -13,6 +13,106 @@ import pandas.util.testing as tm +@pytest.fixture(params=[ + np.array(['a', 'b'], dtype=object), + np.array([0, 1], dtype=float), + np.array([0, 1], dtype=int), + np.array([0, 1 + 2j], dtype=complex), + np.array([True, False], dtype=bool), + np.array([0, 1], dtype='datetime64[ns]'), + np.array([0, 1], dtype='timedelta64[ns]'), 
+]) +def any_numpy_array(request): + """ + Parametrized fixture for NumPy arrays with different dtypes. + + This excludes string and bytes. + """ + return request.param + + +# ---------------------------------------------------------------------------- +# PandasDtype + +@pytest.mark.parametrize('dtype, expected', [ + ('bool', True), + ('int', True), + ('uint', True), + ('float', True), + ('complex', True), + ('str', False), + ('bytes', False), + ('datetime64[ns]', False), + ('object', False), + ('void', False) +]) +def test_is_numeric(dtype, expected): + dtype = PandasDtype(dtype) + assert dtype._is_numeric is expected + + +@pytest.mark.parametrize('dtype, expected', [ + ('bool', True), + ('int', False), + ('uint', False), + ('float', False), + ('complex', False), + ('str', False), + ('bytes', False), + ('datetime64[ns]', False), + ('object', False), + ('void', False) +]) +def test_is_boolean(dtype, expected): + dtype = PandasDtype(dtype) + assert dtype._is_boolean is expected + + +# ---------------------------------------------------------------------------- +# Construction + +def test_constructor_no_coercion(): + with pytest.raises(ValueError, match='NumPy array'): + PandasArray([1, 2, 3]) + + +def test_series_constructor_with_copy(): + ndarray = np.array([1, 2, 3]) + ser = pd.Series(PandasArray(ndarray), copy=True) + + assert ser.values is not ndarray + + +def test_series_constructor_with_astype(): + ndarray = np.array([1, 2, 3]) + result = pd.Series(PandasArray(ndarray), dtype="float64") + expected = pd.Series([1.0, 2.0, 3.0], dtype="float64") + tm.assert_series_equal(result, expected) + + +def test_from_sequence_dtype(): + arr = np.array([1, 2, 3], dtype='int64') + result = PandasArray._from_sequence(arr, dtype='uint64') + expected = PandasArray(np.array([1, 2, 3], dtype='uint64')) + tm.assert_extension_array_equal(result, expected) + + +def test_constructor_copy(): + arr = np.array([0, 1]) + result = PandasArray(arr, copy=True) + + assert 
np.shares_memory(result._ndarray, arr) is False + + +def test_constructor_with_data(any_numpy_array): + nparr = any_numpy_array + arr = PandasArray(nparr) + assert arr.dtype.numpy_dtype == nparr.dtype + + +# ---------------------------------------------------------------------------- +# Conversion + def test_to_numpy(): arr = PandasArray(np.array([1, 2, 3])) result = arr.to_numpy() @@ -26,6 +126,9 @@ def test_to_numpy(): tm.assert_numpy_array_equal(result, expected) +# ---------------------------------------------------------------------------- +# Setitem + def test_setitem_series(): ser = pd.Series([1, 2, 3]) ser.array[0] = 10 @@ -33,6 +136,19 @@ def test_setitem_series(): tm.assert_series_equal(ser, expected) +def test_setitem(any_numpy_array): + nparr = any_numpy_array + arr = PandasArray(nparr, copy=True) + + arr[0] = arr[1] + nparr[0] = nparr[1] + + tm.assert_numpy_array_equal(arr.to_numpy(), nparr) + + +# ---------------------------------------------------------------------------- +# Reductions + def test_bad_reduce_raises(): arr = np.array([1, 2, 3], dtype='int64') arr = PandasArray(arr) @@ -41,13 +157,6 @@ def test_bad_reduce_raises(): arr._reduce(msg) -def test_from_sequence_dtype(): - arr = np.array([1, 2, 3], dtype='int64') - result = PandasArray._from_sequence(arr, dtype='uint64') - expected = PandasArray(np.array([1, 2, 3], dtype='uint64')) - tm.assert_extension_array_equal(result, expected) - - def test_validate_reduction_keyword_args(): arr = PandasArray(np.array([1, 2, 3])) msg = "the 'keepdims' parameter is not supported .*all" @@ -55,6 +164,9 @@ def test_validate_reduction_keyword_args(): arr.all(keepdims=True) +# ---------------------------------------------------------------------------- +# Ops + @td.skip_if_no("numpy", min_version="1.13.0") def test_ufunc(): arr = PandasArray(np.array([-1.0, 0.0, 1.0])) @@ -78,96 +190,3 @@ def test_basic_binop(): result = x + x expected = PandasArray(np.array([2, 4, 6])) tm.assert_extension_array_equal(result, 
expected) - - -def test_constructor_no_coercion(): - with pytest.raises(ValueError, match='NumPy array'): - PandasArray([1, 2, 3]) - - -def test_series_constructor_with_copy(): - ndarray = np.array([1, 2, 3]) - ser = pd.Series(PandasArray(ndarray), copy=True) - - assert ser.values is not ndarray - - -def test_series_constructor_with_astype(): - ndarray = np.array([1, 2, 3]) - result = pd.Series(PandasArray(ndarray), dtype="float64") - expected = pd.Series([1.0, 2.0, 3.0], dtype="float64") - tm.assert_series_equal(result, expected) - - -@pytest.fixture(params=[ - np.array(['a', 'b'], dtype=object), - np.array([0, 1], dtype=float), - np.array([0, 1], dtype=int), - np.array([0, 1+2j], dtype=complex), - np.array([True, False], dtype=bool), - np.array([0, 1], dtype='datetime64[ns]'), - np.array([0, 1], dtype='timedelta64[ns]'), -]) -def any_numpy_array(request): - """Parametrized fixture for NumPy arrays with different dtypes. - - This excludes string and bytes. - """ - return request.param - - -def test_constructor_copy(): - arr = np.array([0, 1]) - result = PandasArray(arr, copy=True) - - assert np.shares_memory(result._ndarray, arr) is False - - -def test_constructor_with_data(any_numpy_array): - nparr = any_numpy_array - arr = PandasArray(nparr) - assert arr.dtype.numpy_dtype == nparr.dtype - - -def test_setitem(any_numpy_array): - nparr = any_numpy_array - arr = PandasArray(nparr, copy=True) - - arr[0] = arr[1] - nparr[0] = nparr[1] - - tm.assert_numpy_array_equal(arr.to_numpy(), nparr) - - -@pytest.mark.parametrize('dtype, expected', [ - ('bool', True), - ('int', True), - ('uint', True), - ('float', True), - ('complex', True), - ('str', False), - ('bytes', False), - ('datetime64[ns]', False), - ('object', False), - ('void', False) -]) -def test_is_numeric(dtype, expected): - dtype = PandasDtype(dtype) - assert dtype._is_numeric is expected - - -@pytest.mark.parametrize('dtype, expected', [ - ('bool', True), - ('int', False), - ('uint', False), - ('float', False), 
- ('complex', False), - ('str', False), - ('bytes', False), - ('datetime64[ns]', False), - ('object', False), - ('void', False) -]) -def test_is_boolean(dtype, expected): - dtype = PandasDtype(dtype) - assert dtype._is_boolean is expected From 35f50a57a7468068e191f23683283d55857d9729 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 28 Dec 2018 11:40:20 -0600 Subject: [PATCH 11/11] py2 compat --- pandas/tests/arrays/test_numpy.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index 72b6bdef27964..b17e509c24e71 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -8,6 +8,7 @@ import pandas.util._test_decorators as td import pandas as pd +from pandas import compat from pandas.arrays import PandasArray from pandas.core.arrays.numpy_ import PandasDtype import pandas.util.testing as tm @@ -41,10 +42,11 @@ def any_numpy_array(request): ('float', True), ('complex', True), ('str', False), - ('bytes', False), + pytest.param('bytes', False, + marks=pytest.mark.skipif(compat.PY2, reason="PY2")), ('datetime64[ns]', False), ('object', False), - ('void', False) + ('void', False), ]) def test_is_numeric(dtype, expected): dtype = PandasDtype(dtype) @@ -58,7 +60,8 @@ def test_is_numeric(dtype, expected): ('float', False), ('complex', False), ('str', False), - ('bytes', False), + pytest.param('bytes', False, + marks=pytest.mark.skipif(compat.PY2, reason="PY2")), ('datetime64[ns]', False), ('object', False), ('void', False)