diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 624045a3d64bc..d616e3f92aa4d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -15,11 +15,12 @@ is_unsigned_integer_dtype, is_signed_integer_dtype, is_integer_dtype, is_complex_dtype, is_object_dtype, + is_extension_array_dtype, is_categorical_dtype, is_sparse, is_period_dtype, is_numeric_dtype, is_float_dtype, is_bool_dtype, needs_i8_conversion, - is_categorical, is_datetimetz, + is_datetimetz, is_datetime64_any_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_interval_dtype, is_scalar, is_list_like, @@ -547,7 +548,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if is_categorical_dtype(values) or is_sparse(values): # handle Categorical and sparse, - result = Series(values).values.value_counts(dropna=dropna) + result = Series(values)._values.value_counts(dropna=dropna) result.name = name counts = result.values @@ -1292,10 +1293,13 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, """ Specialized Cython take which sets NaN values in one pass + This dispatches to ``take`` defined on ExtensionArrays. It does not + currently dispatch to ``SparseArray.take`` for sparse ``arr``. + Parameters ---------- - arr : ndarray - Input array + arr : array-like + Input array. indexer : ndarray 1-D array of indices to take, subarrays corresponding to -1 value indicies are filed with fill_value @@ -1315,17 +1319,25 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, If False, indexer is assumed to contain no -1 values so no filling will be done. This short-circuits computation of a mask. Result is undefined if allow_fill == False and -1 is present in indexer. + + Returns + ------- + subarray : array-like + May be the same type as the input, or cast to an ndarray. 
""" + # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs # dispatch to internal type takes - if is_categorical(arr): - return arr.take_nd(indexer, fill_value=fill_value, - allow_fill=allow_fill) + if is_extension_array_dtype(arr): + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_interval_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + if is_sparse(arr): + arr = arr.get_values() + if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e618dc6b69b2d..cec881394a021 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -25,14 +25,13 @@ class ExtensionArray(object): * isna * take * copy - * _formatting_values * _concat_same_type - Some additional methods are required to satisfy pandas' internal, private + Some additional methods are available to satisfy pandas' internal, private block API. - * _concat_same_type * _can_hold_na + * _formatting_values This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise @@ -53,13 +52,14 @@ class ExtensionArray(object): Extension arrays should be able to be constructed with instances of the class, i.e. ``ExtensionArray(extension_array)`` should return an instance, not error. - - Additionally, certain methods and interfaces are required for proper - this array to be properly stored inside a ``DataFrame`` or ``Series``. """ + # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. + # Don't override this. 
+ _typ = 'extension' # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ + def __getitem__(self, item): # type (Any) -> Any """Select a subset of self. @@ -92,7 +92,46 @@ def __getitem__(self, item): raise AbstractMethodError(self) def __setitem__(self, key, value): - # type: (Any, Any) -> None + # type: (Union[int, np.ndarray], Any) -> None + """Set one or more values inplace. + + This method is not required to satisfy the pandas extension array + interface. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + # Some notes to the ExtensionArray implementor who may have ended up + # here. While this method is not required for the interface, if you + # *do* choose to implement __setitem__, then some semantics should be + # observed: + # + # * Setting multiple values : ExtensionArrays should support setting + # multiple values at once, 'key' will be a sequence of integers and + # 'value' will be a same-length sequence. + # + # * Broadcasting : For a sequence 'key' and a scalar 'value', + # each position in 'key' should be set to 'value'. + # + # * Coercion : Most users will expect basic coercion to work. For + # example, a string like '2018-01-01' is coerced to a datetime + # when setting on a datetime64ns array. In general, if the + # __init__ method coerces that value, then so should __setitem__ raise NotImplementedError(_not_implemented_message.format( type(self), '__setitem__') ) @@ -107,6 +146,16 @@ def __len__(self): # type: () -> int raise AbstractMethodError(self) + def __iter__(self): + """Iterate over elements of the array. 
+ + """ + # This needs to be implemented so that pandas recognizes extension + # arrays as list-like. The default implementation makes successive + # calls to ``__getitem__``, which may be slower than necessary. + for i in range(len(self)): + yield self[i] + # ------------------------------------------------------------------------ # Required attributes # ------------------------------------------------------------------------ @@ -132,9 +181,9 @@ def nbytes(self): # type: () -> int """The number of bytes needed to store this object in memory. - If this is expensive to compute, return an approximate lower bound - on the number of bytes needed. """ + # If this is expensive to compute, return an approximate lower bound + # on the number of bytes needed. raise AbstractMethodError(self) # ------------------------------------------------------------------------ @@ -184,8 +233,8 @@ def take(self, indexer, allow_fill=True, fill_value=None): will be done. This short-circuits computation of a mask. Result is undefined if allow_fill == False and -1 is present in indexer. fill_value : any, default None - Fill value to replace -1 values with. By default, this uses - the missing value sentinel for this type, ``self._fill_value``. + Fill value to replace -1 values with. If applicable, this should + use the sentinel missing value for this type. Notes ----- @@ -198,17 +247,20 @@ def take(self, indexer, allow_fill=True, fill_value=None): Examples -------- - Suppose the extension array somehow backed by a NumPy structured array - and that the underlying structured array is stored as ``self.data``. - Then ``take`` may be written as + Suppose the extension array is backed by a NumPy array stored as + ``self.data``. Then ``take`` may be written as .. 
code-block:: python def take(self, indexer, allow_fill=True, fill_value=None): mask = indexer == -1 result = self.data.take(indexer) - result[mask] = self._fill_value + result[mask] = np.nan # NA for this type return type(self)(result) + + See Also + -------- + numpy.take """ raise AbstractMethodError(self) @@ -230,17 +282,12 @@ def copy(self, deep=False): # ------------------------------------------------------------------------ # Block-related methods # ------------------------------------------------------------------------ - @property - def _fill_value(self): - # type: () -> Any - """The missing value for this type, e.g. np.nan""" - return None def _formatting_values(self): # type: () -> np.ndarray # At the moment, this has to be an array since we use result.dtype """An array of values to be printed in, e.g. the Series repr""" - raise AbstractMethodError(self) + return np.array(self) @classmethod def _concat_same_type(cls, to_concat): @@ -257,6 +304,7 @@ def _concat_same_type(cls, to_concat): """ raise AbstractMethodError(cls) + @property def _can_hold_na(self): # type: () -> bool """Whether your array can hold missing values. True by default. diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index c7c5378801f02..d54d980d02ffa 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -1,4 +1,7 @@ """Extend pandas with custom array types""" +import numpy as np + +from pandas import compat from pandas.errors import AbstractMethodError @@ -23,6 +26,32 @@ class ExtensionDtype(object): def __str__(self): return self.name + def __eq__(self, other): + """Check whether 'other' is equal to self. + + By default, 'other' is considered equal if + + * it's a string matching 'self.name'. + * it's an instance of this type. 
+ + Parameters + ---------- + other : Any + + Returns + ------- + bool + """ + if isinstance(other, compat.string_types): + return other == self.name + elif isinstance(other, type(self)): + return True + else: + return False + + def __ne__(self, other): + return not self.__eq__(other) + @property def type(self): # type: () -> type @@ -102,11 +131,12 @@ def construct_from_string(cls, string): @classmethod def is_dtype(cls, dtype): - """Check if we match 'dtype' + """Check if we match 'dtype'. Parameters ---------- - dtype : str or dtype + dtype : object + The object to check. Returns ------- @@ -118,12 +148,19 @@ def is_dtype(cls, dtype): 1. ``cls.construct_from_string(dtype)`` is an instance of ``cls``. - 2. 'dtype' is ``cls`` or a subclass of ``cls``. + 2. ``dtype`` is an object and is an instance of ``cls`` + 3. ``dtype`` has a ``dtype`` attribute, and any of the above + conditions is true for ``dtype.dtype``. """ - if isinstance(dtype, str): - try: - return isinstance(cls.construct_from_string(dtype), cls) - except TypeError: - return False - else: - return issubclass(dtype, cls) + dtype = getattr(dtype, 'dtype', dtype) + + if isinstance(dtype, np.dtype): + return False + elif dtype is None: + return False + elif isinstance(dtype, cls): + return True + try: + return cls.construct_from_string(dtype) is not None + except TypeError: + return False diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c2b71bc316fe8..197b35de88896 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1708,9 +1708,9 @@ def is_extension_array_dtype(arr_or_dtype): """ from pandas.core.arrays import ExtensionArray - # we want to unpack series, anything else? 
if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)): arr_or_dtype = arr_or_dtype._values + return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray)) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 99e4033f104db..d262a71933915 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -66,13 +66,6 @@ def __hash__(self): raise NotImplementedError("sub-classes should implement an __hash__ " "method") - def __eq__(self, other): - raise NotImplementedError("sub-classes should implement an __eq__ " - "method") - - def __ne__(self, other): - return not self.__eq__(other) - def __getstate__(self): # pickle support; we don't want to pickle the cache return {k: getattr(self, k, None) for k in self._metadata} @@ -82,24 +75,6 @@ def reset_cache(cls): """ clear the cache """ cls._cache = {} - @classmethod - def is_dtype(cls, dtype): - """ Return a boolean if the passed type is an actual dtype that - we can match (via string or type) - """ - if hasattr(dtype, 'dtype'): - dtype = dtype.dtype - if isinstance(dtype, np.dtype): - return False - elif dtype is None: - return False - elif isinstance(dtype, cls): - return True - try: - return cls.construct_from_string(dtype) is not None - except: - return False - class CategoricalDtypeType(type): """ diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index b032cb6f14d4c..cb54c94d29205 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -57,6 +57,8 @@ def _check(cls, inst): ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ", ("dateoffset",)) ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval", )) +ABCExtensionArray = create_pandas_abc_type("ABCExtensionArray", "_typ", + ("extension", "categorical",)) class _ABCGeneric(type): diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index ffac702476af1..01c88c269e7e0 100644 --- a/pandas/core/dtypes/missing.py +++ 
b/pandas/core/dtypes/missing.py @@ -5,14 +5,16 @@ from pandas._libs import lib, missing as libmissing from pandas._libs.tslib import NaT, iNaT from .generic import (ABCMultiIndex, ABCSeries, - ABCIndexClass, ABCGeneric) + ABCIndexClass, ABCGeneric, + ABCExtensionArray) from .common import (is_string_dtype, is_datetimelike, is_datetimelike_v_numeric, is_float_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_interval_dtype, - is_complex_dtype, is_categorical_dtype, + is_complex_dtype, is_string_like_dtype, is_bool_dtype, is_integer_dtype, is_dtype_equal, + is_extension_array_dtype, needs_i8_conversion, _ensure_object, pandas_dtype, is_scalar, @@ -57,7 +59,8 @@ def _isna_new(obj): # hack (for now) because MI registers as ndarray elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") - elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)): + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, + ABCExtensionArray)): return _isna_ndarraylike(obj) elif isinstance(obj, ABCGeneric): return obj._constructor(obj._data.isna(func=isna)) @@ -124,30 +127,31 @@ def _use_inf_as_na(key): def _isna_ndarraylike(obj): - values = getattr(obj, 'values', obj) dtype = values.dtype - if is_string_dtype(dtype): - if is_categorical_dtype(values): - from pandas import Categorical - if not isinstance(values, Categorical): - values = values.values - result = values.isna() - elif is_interval_dtype(values): - from pandas import IntervalIndex - result = IntervalIndex(obj).isna() + if is_extension_array_dtype(obj): + if isinstance(obj, (ABCIndexClass, ABCSeries)): + values = obj._values else: + values = obj + result = values.isna() + elif is_interval_dtype(values): + # TODO(IntervalArray): remove this if block + from pandas import IntervalIndex + result = IntervalIndex(obj).isna() + elif is_string_dtype(dtype): + # Working around NumPy ticket 1542 + shape = values.shape - # Working around NumPy ticket 1542 - 
shape = values.shape - - if is_string_like_dtype(dtype): - result = np.zeros(values.shape, dtype=bool) - else: - result = np.empty(shape, dtype=bool) - vec = libmissing.isnaobj(values.ravel()) - result[...] = vec.reshape(shape) + if is_string_like_dtype(dtype): + # object array of strings + result = np.zeros(values.shape, dtype=bool) + else: + # object array of non-strings + result = np.empty(shape, dtype=bool) + vec = libmissing.isnaobj(values.ravel()) + result[...] = vec.reshape(shape) elif needs_i8_conversion(obj): # this is the NaT pattern @@ -406,4 +410,7 @@ def remove_na_arraylike(arr): """ Return array-like containing only true/non-NaN values, possibly empty. """ - return arr[notna(lib.values_from_object(arr))] + if is_extension_array_dtype(arr): + return arr[notna(arr)] + else: + return arr[notna(lib.values_from_object(arr))] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d81d22173bfbd..1b781be8fa2b3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -39,6 +39,7 @@ is_categorical_dtype, is_object_dtype, is_extension_type, + is_extension_array_dtype, is_datetimetz, is_datetime64_any_dtype, is_datetime64tz_dtype, @@ -71,7 +72,7 @@ create_block_manager_from_arrays, create_block_manager_from_blocks) from pandas.core.series import Series -from pandas.core.arrays import Categorical +from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.algorithms as algorithms from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) @@ -511,7 +512,7 @@ def _get_axes(N, K, index=index, columns=columns): index, columns = _get_axes(len(values), 1) return _arrays_to_mgr([values], columns, index, columns, dtype=dtype) - elif is_datetimetz(values): + elif (is_datetimetz(values) or is_extension_array_dtype(values)): # GH19157 if columns is None: columns = [0] @@ -2820,7 +2821,7 @@ def reindexer(value): # now align rows value = reindexer(value).T - elif isinstance(value, 
Categorical): + elif isinstance(value, ExtensionArray): value = value.copy() elif isinstance(value, Index) or is_sequence(value): @@ -2850,7 +2851,7 @@ def reindexer(value): value = maybe_cast_to_datetime(value, value.dtype) # return internal types directly - if is_extension_type(value): + if is_extension_type(value) or is_extension_array_dtype(value): return value # broadcast across multiple columns if necessary @@ -3387,12 +3388,8 @@ class max type new_obj = self.copy() def _maybe_casted_values(index, labels=None): - if isinstance(index, PeriodIndex): - values = index.astype(object).values - elif isinstance(index, DatetimeIndex) and index.tz is not None: - values = index - else: - values = index.values + values = index._values + if not isinstance(index, (PeriodIndex, DatetimeIndex)): if values.dtype == np.object_: values = lib.maybe_convert_objects(values) @@ -5604,7 +5601,9 @@ def count(self, axis=0, level=None, numeric_only=False): if len(frame._get_axis(axis)) == 0: result = Series(0, index=frame._get_agg_axis(axis)) else: - if frame._is_mixed_type: + if frame._is_mixed_type or frame._data.any_extension_types: + # the or any_extension_types is really only hit for single- + # column frames with an extension array result = notna(frame).sum(axis=axis) else: counts = notna(frame.values).sum(axis=axis) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7dfa34bd634ad..ef9facbacf490 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -13,6 +13,7 @@ from pandas import compat from pandas.core.accessor import CachedAccessor +from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.generic import ( ABCSeries, ABCDataFrame, ABCMultiIndex, @@ -1982,6 +1983,7 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): if is_categorical_dtype(values.dtype): values = np.array(values) + elif is_object_dtype(values.dtype): values = lib.maybe_convert_objects(values, safe=1) @@ -2581,7 +2583,7 @@ def 
get_value(self, series, key): # if we have something that is Index-like, then # use this, e.g. DatetimeIndex s = getattr(series, '_values', None) - if isinstance(s, Index) and is_scalar(key): + if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): try: return s[key] except (IndexError, ValueError): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 352ce921d1d44..50f3c7a6b3d3d 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -618,6 +618,9 @@ def can_do_equal_len(): return if isinstance(value, (ABCSeries, dict)): + # TODO(EA): ExtensionBlock.setitem this causes issues with + # setting for extensionarrays that store dicts. Need to decide + # if it's worth supporting that. value = self._align_series(indexer, Series(value)) elif isinstance(value, ABCDataFrame): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index dd5feefc49fe3..bad0626206e80 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -56,7 +56,11 @@ is_null_datelike_scalar) import pandas.core.dtypes.concat as _concat -from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex +from pandas.core.dtypes.generic import ( + ABCSeries, + ABCDatetimeIndex, + ABCExtensionArray, + ABCIndexClass) import pandas.core.common as com import pandas.core.algorithms as algos @@ -99,6 +103,7 @@ class Block(PandasObject): is_object = False is_categorical = False is_sparse = False + is_extension = False _box_to_block_values = True _can_hold_na = False _can_consolidate = True @@ -1854,11 +1859,40 @@ class ExtensionBlock(NonConsolidatableMixIn, Block): ExtensionArrays are limited to 1-D. """ + is_extension = True + + def __init__(self, values, placement, ndim=None): + values = self._maybe_coerce_values(values) + super(ExtensionBlock, self).__init__(values, placement, ndim) + + def _maybe_coerce_values(self, values): + """Unbox to an extension array. + + This will unbox an ExtensionArray stored in an Index or Series. 
+ ExtensionArrays pass through. No dtype coercion is done. + + Parameters + ---------- + values : Index, Series, ExtensionArray + + Returns + ------- + ExtensionArray + """ + if isinstance(values, (ABCIndexClass, ABCSeries)): + values = values._values + return values + @property def _holder(self): # For extension blocks, the holder is values-dependent. return type(self.values) + @property + def _can_hold_na(self): + # The default ExtensionArray._can_hold_na is True + return self._holder._can_hold_na + @property def is_view(self): """Extension arrays are never treated as views.""" @@ -3451,6 +3485,8 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, else: align_keys = [] + # TODO(EA): may interfere with ExtensionBlock.setitem for blocks + # with a .values attribute. aligned_args = dict((k, kwargs[k]) for k in align_keys if hasattr(kwargs[k], 'values')) @@ -3696,6 +3732,11 @@ def is_datelike_mixed_type(self): self._consolidate_inplace() return any(block.is_datelike for block in self.blocks) + @property + def any_extension_types(self): + """Whether any of the blocks in this manager are extension blocks""" + return any(block.is_extension for block in self.blocks) + @property def is_view(self): """ return a boolean if we are a single block and are a view """ @@ -4101,7 +4142,10 @@ def set(self, item, value, check=False): # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical - value_is_extension_type = is_extension_type(value) + # TODO(EA): Remove an is_extension_ when all extension types satisfy + # the interface + value_is_extension_type = (is_extension_type(value) or + is_extension_array_dtype(value)) # categorical/spares/datetimetz if value_is_extension_type: @@ -4833,15 +4877,11 @@ def form_blocks(arrays, names, axes): if len(items_dict['ExtensionBlock']): - external_blocks = [] - for i, _, array in items_dict['ExtensionBlock']: - if isinstance(array, ABCSeries): - 
array = array.values - # Allow our internal arrays to chose their block type. - block_type = getattr(array, '_block_type', ExtensionBlock) - external_blocks.append( - make_block(array, klass=block_type, - fastpath=True, placement=[i])) + external_blocks = [ + make_block(array, klass=ExtensionBlock, placement=[i]) + for i, _, array in items_dict['ExtensionBlock'] + ] + blocks.extend(external_blocks) if len(extra_locs): @@ -5162,7 +5202,7 @@ def _safe_reshape(arr, new_shape): """ if isinstance(arr, ABCSeries): arr = arr._values - if not isinstance(arr, Categorical): + if not isinstance(arr, ABCExtensionArray): arr = arr.reshape(new_shape) return arr @@ -5673,6 +5713,8 @@ def is_na(self): if not values._null_fill_value and values.sp_index.ngaps > 0: return False values_flat = values.ravel(order='K') + elif isinstance(self.block, ExtensionBlock): + values_flat = values else: values_flat = values.ravel(order='K') total_len = values_flat.shape[0] diff --git a/pandas/core/series.py b/pandas/core/series.py index 90dc14836ab55..12865bfe44a3b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -14,12 +14,14 @@ import numpy.ma as ma from pandas.core.accessor import CachedAccessor +from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.common import ( is_categorical_dtype, is_bool, is_integer, is_integer_dtype, is_float_dtype, is_extension_type, + is_extension_array_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_list_like, @@ -173,12 +175,17 @@ def __init__(self, data=None, index=None, dtype=None, name=None, raise NotImplementedError("initializing a Series from a " "MultiIndex is not supported") elif isinstance(data, Index): - # need to copy to avoid aliasing issues if name is None: name = data.name - data = data._to_embed(keep_tz=True, dtype=dtype) + if dtype is not None: + # astype copies + data = data.astype(dtype) + else: + # need to copy to avoid aliasing issues + data = data._values.copy() copy = False + elif isinstance(data, 
np.ndarray): pass elif isinstance(data, Series): @@ -203,13 +210,15 @@ def __init__(self, data=None, index=None, dtype=None, name=None, '`data` argument and a different ' '`index` argument. `copy` must ' 'be False.') - elif isinstance(data, Categorical): + + elif is_extension_array_dtype(data) and dtype is not None: # GH12574: Allow dtype=category only, otherwise error - if ((dtype is not None) and - not is_categorical_dtype(dtype)): - raise ValueError("cannot specify a dtype with a " - "Categorical unless " - "dtype='category'") + if not data.dtype.is_dtype(dtype): + raise ValueError("Cannot specify a dtype '{}' with an " + "extension array of a different " + "dtype ('{}').".format(dtype, + data.dtype)) + elif (isinstance(data, types.GeneratorType) or (compat.PY3 and isinstance(data, map))): data = list(data) @@ -2556,8 +2565,7 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - # be subclass-friendly - new_values = algorithms.take_1d(self.get_values(), indexer) + new_values = algorithms.take_1d(self._values, indexer) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): @@ -3113,10 +3121,11 @@ def _sanitize_index(data, index, copy=False): if isinstance(data, ABCIndexClass) and not copy: pass - elif isinstance(data, PeriodIndex): - data = data.astype(object).values - elif isinstance(data, DatetimeIndex): - data = data._to_embed(keep_tz=True) + elif isinstance(data, (PeriodIndex, DatetimeIndex)): + data = data._values + if copy: + data = data.copy() + elif isinstance(data, np.ndarray): # coerce datetimelike types @@ -3156,8 +3165,17 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): + # We *do* allow casting to categorical, since we know + # that Categorical is the only array type for 'category'. 
subarr = Categorical(arr, dtype.categories, ordered=dtype.ordered) + elif is_extension_array_dtype(dtype): + # We don't allow casting to third party dtypes, since we don't + # know what array belongs to which type. + msg = ("Cannot cast data to extension dtype '{}'. " + "Pass the extension array directly.".format(dtype)) + raise ValueError(msg) + elif dtype is not None and raise_cast_failure: raise else: @@ -3189,9 +3207,15 @@ def _try_cast(arr, take_fast_path): # we will try to copy be-definition here subarr = _try_cast(data, True) - elif isinstance(data, Categorical): + elif isinstance(data, ExtensionArray): subarr = data + if dtype is not None and not data.dtype.is_dtype(dtype): + msg = ("Cannot coerce extension array to dtype '{typ}'. " + "Do the coercion before passing to the constructor " + "instead.".format(typ=dtype)) + raise ValueError(msg) + if copy: subarr = data.copy() return subarr diff --git a/pandas/tests/categorical/test_missing.py b/pandas/tests/categorical/test_missing.py index 79758dee5cfda..c8ac6a6ef14f8 100644 --- a/pandas/tests/categorical/test_missing.py +++ b/pandas/tests/categorical/test_missing.py @@ -1,9 +1,8 @@ # -*- coding: utf-8 -*- - import numpy as np import pandas.util.testing as tm -from pandas import (Categorical, Index, isna) +from pandas import Categorical, Index, isna from pandas.compat import lrange from pandas.core.dtypes.dtypes import CategoricalDtype diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py new file mode 100644 index 0000000000000..2273ef1f3e110 --- /dev/null +++ b/pandas/tests/extension/base/__init__.py @@ -0,0 +1,42 @@ +"""Base test suite for extension arrays. + +These tests are intended for third-party libraries to subclass to validate +that their extension arrays and dtypes satisfy the interface. Moving or +renaming the tests should not be done lightly. + +Libraries are expected to implement a few pytest fixtures to provide data +for the tests. 
The fixtures may be located in either + +* The same module as your test class. +* A ``conftest.py`` in the same directory as your test class. + +The full list of fixtures may be found in the ``conftest.py`` next to this +file. + +.. code-block:: python + + import pytest + from pandas.tests.extension.base import BaseDtypeTests + + + @pytest.fixture + def dtype(): + return MyDtype() + + + class TestMyDtype(BaseDtypeTests): + pass + + +Your class ``TestDtype`` will inherit all the tests defined on +``BaseDtypeTests``. pytest's fixture discover will supply your ``dtype`` +wherever the test requires it. You're free to implement additional tests. +""" +from .casting import BaseCastingTests # noqa +from .constructors import BaseConstructorsTests # noqa +from .dtype import BaseDtypeTests # noqa +from .getitem import BaseGetitemTests # noqa +from .interface import BaseInterfaceTests # noqa +from .methods import BaseMethodsTests # noqa +from .missing import BaseMissingTests # noqa +from .reshaping import BaseReshapingTests # noqa diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py new file mode 100644 index 0000000000000..bcfbf0a247269 --- /dev/null +++ b/pandas/tests/extension/base/casting.py @@ -0,0 +1,11 @@ +import pandas as pd +from pandas.core.internals import ObjectBlock + + +class BaseCastingTests(object): + """Casting to and from ExtensionDtypes""" + + def test_astype_object_series(self, all_data): + ser = pd.Series({"A": all_data}) + result = ser.astype(object) + assert isinstance(result._data.blocks[0], ObjectBlock) diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py new file mode 100644 index 0000000000000..7ad100e6289e9 --- /dev/null +++ b/pandas/tests/extension/base/constructors.py @@ -0,0 +1,43 @@ +import pytest + +import pandas as pd +import pandas.util.testing as tm +from pandas.core.internals import ExtensionBlock + + +class BaseConstructorsTests(object): + + def 
test_series_constructor(self, data): + result = pd.Series(data) + assert result.dtype == data.dtype + assert len(result) == len(data) + assert isinstance(result._data.blocks[0], ExtensionBlock) + assert result._data.blocks[0].values is data + + # Series[EA] is unboxed / boxed correctly + result2 = pd.Series(result) + assert result2.dtype == data.dtype + assert isinstance(result2._data.blocks[0], ExtensionBlock) + + @pytest.mark.parametrize("from_series", [True, False]) + def test_dataframe_constructor_from_dict(self, data, from_series): + if from_series: + data = pd.Series(data) + result = pd.DataFrame({"A": data}) + assert result.dtypes['A'] == data.dtype + assert result.shape == (len(data), 1) + assert isinstance(result._data.blocks[0], ExtensionBlock) + + def test_dataframe_from_series(self, data): + result = pd.DataFrame(pd.Series(data)) + assert result.dtypes[0] == data.dtype + assert result.shape == (len(data), 1) + assert isinstance(result._data.blocks[0], ExtensionBlock) + + @pytest.mark.xfail(reason="GH-19342") + def test_series_given_mismatched_index_raises(self, data): + msg = 'Wrong number of items passed 3, placement implies 4' + with tm.assert_raises_regex(ValueError, None) as m: + pd.Series(data[:3], index=[0, 1, 2, 3, 4]) + + assert m.match(msg) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py new file mode 100644 index 0000000000000..f5015bd469f13 --- /dev/null +++ b/pandas/tests/extension/base/dtype.py @@ -0,0 +1,46 @@ +import numpy as np +import pandas as pd + + +class BaseDtypeTests(object): + """Base class for ExtensionDtype classes""" + + def test_name(self, dtype): + assert isinstance(dtype.name, str) + + def test_kind(self, dtype): + valid = set('biufcmMOSUV') + if dtype.kind is not None: + assert dtype.kind in valid + + def test_construct_from_string_own_name(self, dtype): + result = dtype.construct_from_string(dtype.name) + assert type(result) is type(dtype) + + # check OK as classmethod + result = 
type(dtype).construct_from_string(dtype.name) + assert type(result) is type(dtype) + + def test_is_dtype_from_name(self, dtype): + result = type(dtype).is_dtype(dtype.name) + assert result is True + + def test_is_dtype_unboxes_dtype(self, data, dtype): + assert dtype.is_dtype(data) is True + + def test_is_dtype_from_self(self, dtype): + result = type(dtype).is_dtype(dtype) + assert result is True + + def test_is_not_string_type(self, dtype): + return not pd.api.types.is_string_dtype(dtype) + + def test_is_not_object_type(self, dtype): + return not pd.api.types.is_object_dtype(dtype) + + def test_eq_with_str(self, dtype): + assert dtype == dtype.name + assert dtype != dtype.name + '-suffix' + + def test_eq_with_numpy_object(self, dtype): + assert dtype != np.dtype('object') diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py new file mode 100644 index 0000000000000..f43971e928cac --- /dev/null +++ b/pandas/tests/extension/base/getitem.py @@ -0,0 +1,119 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +class BaseGetitemTests(object): + """Tests for ExtensionArray.__getitem__.""" + + def test_iloc_series(self, data): + ser = pd.Series(data) + result = ser.iloc[:4] + expected = pd.Series(data[:4]) + tm.assert_series_equal(result, expected) + + result = ser.iloc[[0, 1, 2, 3]] + tm.assert_series_equal(result, expected) + + def test_iloc_frame(self, data): + df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) + expected = pd.DataFrame({"A": data[:4]}) + + # slice -> frame + result = df.iloc[:4, [0]] + tm.assert_frame_equal(result, expected) + + # sequence -> frame + result = df.iloc[[0, 1, 2, 3], [0]] + tm.assert_frame_equal(result, expected) + + expected = pd.Series(data[:4], name='A') + + # slice -> series + result = df.iloc[:4, 0] + tm.assert_series_equal(result, expected) + + # sequence -> series + result = df.iloc[:4, 0] + tm.assert_series_equal(result, expected) + + def 
test_loc_series(self, data): + ser = pd.Series(data) + result = ser.loc[:3] + expected = pd.Series(data[:4]) + tm.assert_series_equal(result, expected) + + result = ser.loc[[0, 1, 2, 3]] + tm.assert_series_equal(result, expected) + + def test_loc_frame(self, data): + df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) + expected = pd.DataFrame({"A": data[:4]}) + + # slice -> frame + result = df.loc[:3, ['A']] + tm.assert_frame_equal(result, expected) + + # sequence -> frame + result = df.loc[[0, 1, 2, 3], ['A']] + tm.assert_frame_equal(result, expected) + + expected = pd.Series(data[:4], name='A') + + # slice -> series + result = df.loc[:3, 'A'] + tm.assert_series_equal(result, expected) + + # sequence -> series + result = df.loc[:3, 'A'] + tm.assert_series_equal(result, expected) + + def test_getitem_scalar(self, data): + result = data[0] + assert isinstance(result, data.dtype.type) + + result = pd.Series(data)[0] + assert isinstance(result, data.dtype.type) + + def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): + result = data_missing[0] + assert na_cmp(result, na_value) + + def test_getitem_mask(self, data): + # Empty mask, raw array + mask = np.zeros(len(data), dtype=bool) + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + # Empty mask, in series + mask = np.zeros(len(data), dtype=bool) + result = pd.Series(data)[mask] + assert len(result) == 0 + assert result.dtype == data.dtype + + # non-empty mask, raw array + mask[0] = True + result = data[mask] + assert len(result) == 1 + assert isinstance(result, type(data)) + + # non-empty mask, in series + result = pd.Series(data)[mask] + assert len(result) == 1 + assert result.dtype == data.dtype + + def test_getitem_slice(self, data): + # getitem[slice] should return an array + result = data[slice(0)] # empty + assert isinstance(result, type(data)) + + result = data[slice(1)] # scalar + assert isinstance(result, type(data)) + + def test_take_sequence(self, 
data): + result = pd.Series(data)[[0, 1, 3]] + assert result.iloc[0] == data[0] + assert result.iloc[1] == data[1] + assert result.iloc[2] == data[3] diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py new file mode 100644 index 0000000000000..8f17131a9482b --- /dev/null +++ b/pandas/tests/extension/base/interface.py @@ -0,0 +1,53 @@ +import numpy as np + +import pandas as pd +from pandas.compat import StringIO +from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import ExtensionDtype + + +class BaseInterfaceTests(object): + """Tests that the basic interface is satisfied.""" + # ------------------------------------------------------------------------ + # Interface + # ------------------------------------------------------------------------ + + def test_len(self, data): + assert len(data) == 100 + + def test_ndim(self, data): + assert data.ndim == 1 + + def test_can_hold_na_valid(self, data): + assert data._can_hold_na in {True, False} + + def test_memory_usage(self, data): + s = pd.Series(data) + result = s.memory_usage(index=False) + assert result == s.nbytes + + def test_array_interface(self, data): + result = np.array(data) + assert result[0] == data[0] + + def test_as_ndarray_with_dtype_kind(self, data): + np.array(data, dtype=data.dtype.kind) + + def test_repr(self, data): + ser = pd.Series(data) + assert data.dtype.name in repr(ser) + + df = pd.DataFrame({"A": data}) + repr(df) + + def test_dtype_name_in_info(self, data): + buf = StringIO() + pd.DataFrame({"A": data}).info(buf=buf) + result = buf.getvalue() + assert data.dtype.name in result + + def test_is_extension_array_dtype(self, data): + assert is_extension_array_dtype(data) + assert is_extension_array_dtype(data.dtype) + assert is_extension_array_dtype(pd.Series(data)) + assert isinstance(data.dtype, ExtensionDtype) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py new file 
mode 100644 index 0000000000000..c77811ca63926 --- /dev/null +++ b/pandas/tests/extension/base/methods.py @@ -0,0 +1,32 @@ +import pytest +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +class BaseMethodsTests(object): + """Various Series and DataFrame methods.""" + + @pytest.mark.parametrize('dropna', [True, False]) + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + + tm.assert_series_equal(result, expected) + + def test_count(self, data_missing): + df = pd.DataFrame({"A": data_missing}) + result = df.count(axis='columns') + expected = pd.Series([0, 1]) + tm.assert_series_equal(result, expected) + + def test_apply_simple_series(self, data): + result = pd.Series(data).apply(id) + assert isinstance(result, pd.Series) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py new file mode 100644 index 0000000000000..1d6f2eea1f1f9 --- /dev/null +++ b/pandas/tests/extension/base/missing.py @@ -0,0 +1,45 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +class BaseMissingTests(object): + def test_isna(self, data_missing): + if data_missing._can_hold_na: + expected = np.array([True, False]) + else: + expected = np.array([False, False]) + + result = pd.isna(data_missing) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(data_missing).isna() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + def test_dropna_series(self, data_missing): + ser = pd.Series(data_missing) + result = ser.dropna() + expected = ser.iloc[[1]] + tm.assert_series_equal(result, expected) + + def test_dropna_frame(self, data_missing): + df = pd.DataFrame({"A": data_missing}) + + # defaults + result = 
df.dropna() + expected = df.iloc[[1]] + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.dropna(axis='columns') + expected = pd.DataFrame(index=[0, 1]) + tm.assert_frame_equal(result, expected) + + # multiple + df = pd.DataFrame({"A": data_missing, + "B": [1, np.nan]}) + result = df.dropna() + expected = df.iloc[:0] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py new file mode 100644 index 0000000000000..d8f577c6fa50d --- /dev/null +++ b/pandas/tests/extension/base/reshaping.py @@ -0,0 +1,61 @@ +import pytest + +import pandas as pd +import pandas.util.testing as tm +from pandas.core.internals import ExtensionBlock + + +class BaseReshapingTests(object): + """Tests for reshaping and concatenation.""" + @pytest.mark.parametrize('in_frame', [True, False]) + def test_concat(self, data, in_frame): + wrapped = pd.Series(data) + if in_frame: + wrapped = pd.DataFrame(wrapped) + result = pd.concat([wrapped, wrapped], ignore_index=True) + + assert len(result) == len(data) * 2 + + if in_frame: + dtype = result.dtypes[0] + else: + dtype = result.dtype + + assert dtype == data.dtype + assert isinstance(result._data.blocks[0], ExtensionBlock) + + def test_align(self, data, na_value): + a = data[:3] + b = data[2:5] + r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.Series(type(data)(list(a) + [na_value])) + e2 = pd.Series(type(data)([na_value] + list(b))) + tm.assert_series_equal(r1, e1) + tm.assert_series_equal(r2, e2) + + def test_align_frame(self, data, na_value): + a = data[:3] + b = data[2:5] + r1, r2 = pd.DataFrame({'A': a}).align( + pd.DataFrame({'A': b}, index=[1, 2, 3]) + ) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.DataFrame({'A': type(data)(list(a) + [na_value])}) + e2 = pd.DataFrame({'A': type(data)([na_value] + list(b))}) + 
tm.assert_frame_equal(r1, e1) + tm.assert_frame_equal(r2, e2) + + def test_set_frame_expand_regular_with_extension(self, data): + df = pd.DataFrame({"A": [1] * len(data)}) + df['B'] = data + expected = pd.DataFrame({"A": [1] * len(data), "B": data}) + tm.assert_frame_equal(df, expected) + + def test_set_frame_expand_extension_with_regular(self, data): + df = pd.DataFrame({'A': data}) + df['B'] = [1] * len(data) + expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/extension/category/__init__.py b/pandas/tests/extension/category/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py new file mode 100644 index 0000000000000..ec548fca6d901 --- /dev/null +++ b/pandas/tests/extension/category/test_categorical.py @@ -0,0 +1,84 @@ +import string + +import pytest +import numpy as np + +from pandas.api.types import CategoricalDtype +from pandas import Categorical +from pandas.tests.extension import base + + +def make_data(): + return np.random.choice(list(string.ascii_letters), size=100) + + +@pytest.fixture +def dtype(): + return CategoricalDtype() + + +@pytest.fixture +def data(): + """Length-100 Categorical for semantics test.""" + return Categorical(make_data()) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return Categorical([np.nan, 'A']) + + +@pytest.fixture +def na_value(): + return np.nan + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + @pytest.mark.skip(reason="Memory usage doesn't match") + def test_memory_usage(self): + # Is this deliberate? 
+ pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + @pytest.mark.skip(reason="Unobserved categories preseved in concat.") + def test_align(self, data, na_value): + pass + + @pytest.mark.skip(reason="Unobserved categories preseved in concat.") + def test_align_frame(self, data, na_value): + pass + + +class TestGetitem(base.BaseGetitemTests): + @pytest.mark.skip(reason="Backwards compatability") + def test_getitem_scalar(self): + # CategoricalDtype.type isn't "correct" since it should + # be a parent of the elements (object). But don't want + # to break things by changing. + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + pass + + @pytest.mark.skip(reason="Unobserved categories included") + def test_value_counts(self, all_data, dropna): + pass + + +class TestCasting(base.BaseCastingTests): + pass diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py new file mode 100644 index 0000000000000..f86849b9cbd61 --- /dev/null +++ b/pandas/tests/extension/conftest.py @@ -0,0 +1,48 @@ +import operator + +import pytest + + +@pytest.fixture +def dtype(): + """A fixture providing the ExtensionDtype to validate.""" + raise NotImplementedError + + +@pytest.fixture +def data(): + """Length-100 array for this type.""" + raise NotImplementedError + + +@pytest.fixture +def data_missing(): + """Length-2 array with [NA, Valid]""" + raise NotImplementedError + + +@pytest.fixture(params=['data', 'data_missing']) +def all_data(request, data, data_missing): + """Parametrized fixture giving 'data' and 'data_missing'""" + if request.param == 'data': + return data + elif request.param == 'data_missing': + return data_missing + + +@pytest.fixture +def na_cmp(): + """Binary operator for comparing NA values. + + Should return a function of two arguments that returns + True if both arguments are (scalar) NA for your type. 
+ + By default, uses ``operator.is_`` + """ + return operator.is_ + + +@pytest.fixture +def na_value(): + """The scalar missing value for this type. Default 'None'""" + return None diff --git a/pandas/tests/extension/decimal/__init__.py b/pandas/tests/extension/decimal/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py new file mode 100644 index 0000000000000..f526ac5996a10 --- /dev/null +++ b/pandas/tests/extension/decimal/array.py @@ -0,0 +1,86 @@ +import decimal +import numbers +import random +import sys + +import numpy as np + +import pandas as pd +from pandas.core.arrays import ExtensionArray +from pandas.core.dtypes.base import ExtensionDtype + + +class DecimalDtype(ExtensionDtype): + type = decimal.Decimal + name = 'decimal' + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string)) + + +class DecimalArray(ExtensionArray): + dtype = DecimalDtype() + + def __init__(self, values): + values = np.asarray(values, dtype=object) + + self.values = values + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self.values[item] + else: + return type(self)(self.values[item]) + + def copy(self, deep=False): + if deep: + return type(self)(self.values.copy()) + return type(self)(self) + + def __setitem__(self, key, value): + if pd.api.types.is_list_like(value): + value = [decimal.Decimal(v) for v in value] + else: + value = decimal.Decimal(value) + self.values[key] = value + + def __len__(self): + return len(self.values) + + def __repr__(self): + return repr(self.values) + + @property + def nbytes(self): + n = len(self) + if n: + return n * sys.getsizeof(self[0]) + return 0 + + def isna(self): + return np.array([x.is_nan() for x in self.values]) + + def take(self, indexer, allow_fill=True, fill_value=None): + 
mask = indexer == -1 + + out = self.values.take(indexer) + out[mask] = self._na_value + + return type(self)(out) + + @property + def _na_value(self): + return decimal.Decimal('NaN') + + @classmethod + def _concat_same_type(cls, to_concat): + return cls(np.concatenate([x.values for x in to_concat])) + + +def make_data(): + return [decimal.Decimal(random.random()) for _ in range(100)] diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py new file mode 100644 index 0000000000000..7b4d079ecad87 --- /dev/null +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -0,0 +1,154 @@ +import decimal + +import numpy as np +import pandas as pd +import pandas.util.testing as tm +import pytest + +from pandas.tests.extension import base + +from .array import DecimalDtype, DecimalArray, make_data + + +@pytest.fixture +def dtype(): + return DecimalDtype() + + +@pytest.fixture +def data(): + return DecimalArray(make_data()) + + +@pytest.fixture +def data_missing(): + return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)]) + + +@pytest.fixture +def na_cmp(): + return lambda x, y: x.is_nan() and y.is_nan() + + +@pytest.fixture +def na_value(): + return decimal.Decimal("NaN") + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + + def test_align(self, data, na_value): + # Have to override since assert_series_equal doesn't + # compare Decimal(NaN) properly. 
+ a = data[:3] + b = data[2:5] + r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + + # NaN handling + e1 = pd.Series(type(data)(list(a) + [na_value])) + e2 = pd.Series(type(data)([na_value] + list(b))) + tm.assert_series_equal(r1.iloc[:3], e1.iloc[:3]) + assert r1[3].is_nan() + assert e1[3].is_nan() + + tm.assert_series_equal(r2.iloc[1:], e2.iloc[1:]) + assert r2[0].is_nan() + assert e2[0].is_nan() + + def test_align_frame(self, data, na_value): + # Override for Decimal(NaN) comparison + a = data[:3] + b = data[2:5] + r1, r2 = pd.DataFrame({'A': a}).align( + pd.DataFrame({'A': b}, index=[1, 2, 3]) + ) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.DataFrame({'A': type(data)(list(a) + [na_value])}) + e2 = pd.DataFrame({'A': type(data)([na_value] + list(b))}) + + tm.assert_frame_equal(r1.iloc[:3], e1.iloc[:3]) + assert r1.loc[3, 'A'].is_nan() + assert e1.loc[3, 'A'].is_nan() + + tm.assert_frame_equal(r2.iloc[1:], e2.iloc[1:]) + assert r2.loc[0, 'A'].is_nan() + assert e2.loc[0, 'A'].is_nan() + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.xfail(reason="value_counts not implemented yet.") + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + + tm.assert_series_equal(result, expected) + + +class TestCasting(base.BaseCastingTests): + pass + + +def test_series_constructor_coerce_data_to_extension_dtype_raises(): + xpr = ("Cannot cast data to extension dtype 'decimal'. 
Pass the " + "extension array directly.") + with tm.assert_raises_regex(ValueError, xpr): + pd.Series([0, 1, 2], dtype=DecimalDtype()) + + +def test_series_constructor_with_same_dtype_ok(): + arr = DecimalArray([decimal.Decimal('10.0')]) + result = pd.Series(arr, dtype=DecimalDtype()) + expected = pd.Series(arr) + tm.assert_series_equal(result, expected) + + +def test_series_constructor_coerce_extension_array_to_dtype_raises(): + arr = DecimalArray([decimal.Decimal('10.0')]) + xpr = "Cannot specify a dtype 'int64' .* \('decimal'\)." + + with tm.assert_raises_regex(ValueError, xpr): + pd.Series(arr, dtype='int64') + + +def test_dataframe_constructor_with_same_dtype_ok(): + arr = DecimalArray([decimal.Decimal('10.0')]) + + result = pd.DataFrame({"A": arr}, dtype=DecimalDtype()) + expected = pd.DataFrame({"A": arr}) + tm.assert_frame_equal(result, expected) + + +def test_dataframe_constructor_with_different_dtype_raises(): + arr = DecimalArray([decimal.Decimal('10.0')]) + + xpr = "Cannot coerce extension array to dtype 'int64'. 
" + with tm.assert_raises_regex(ValueError, xpr): + pd.DataFrame({"A": arr}, dtype='int64') diff --git a/pandas/tests/extension/json/__init__.py b/pandas/tests/extension/json/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py new file mode 100644 index 0000000000000..90aac93c68f64 --- /dev/null +++ b/pandas/tests/extension/json/array.py @@ -0,0 +1,99 @@ +import collections +import itertools +import numbers +import random +import string +import sys + +import numpy as np + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.arrays import ExtensionArray + + +class JSONDtype(ExtensionDtype): + type = collections.Mapping + name = 'json' + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string)) + + +class JSONArray(ExtensionArray): + dtype = JSONDtype() + + def __init__(self, values): + for val in values: + if not isinstance(val, self.dtype.type): + raise TypeError + self.data = values + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self.data[item] + elif isinstance(item, np.ndarray) and item.dtype == 'bool': + return type(self)([x for x, m in zip(self, item) if m]) + else: + return type(self)(self.data[item]) + + def __setitem__(self, key, value): + if isinstance(key, numbers.Integral): + self.data[key] = value + else: + if not isinstance(value, (type(self), + collections.Sequence)): + # broadcast value + value = itertools.cycle([value]) + + if isinstance(key, np.ndarray) and key.dtype == 'bool': + # masking + for i, (k, v) in enumerate(zip(key, value)): + if k: + assert isinstance(v, self.dtype.type) + self.data[i] = v + else: + for k, v in zip(key, value): + assert isinstance(v, self.dtype.type) + self.data[k] = v + + def __len__(self): + return len(self.data) + + def __repr__(self): + 
return 'JSONArary({!r})'.format(self.data) + + @property + def nbytes(self): + return sys.getsizeof(self.data) + + def isna(self): + return np.array([x == self._na_value for x in self.data]) + + def take(self, indexer, allow_fill=True, fill_value=None): + output = [self.data[loc] if loc != -1 else self._na_value + for loc in indexer] + return type(self)(output) + + def copy(self, deep=False): + return type(self)(self.data[:]) + + @property + def _na_value(self): + return {} + + @classmethod + def _concat_same_type(cls, to_concat): + data = list(itertools.chain.from_iterable([x.data for x in to_concat])) + return cls(data) + + +def make_data(): + # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer + return [collections.UserDict([ + (random.choice(string.ascii_letters), random.randint(0, 100)) + for _ in range(random.randint(0, 10))]) for _ in range(100)] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py new file mode 100644 index 0000000000000..e0721bb1d8d1a --- /dev/null +++ b/pandas/tests/extension/json/test_json.py @@ -0,0 +1,73 @@ +import operator +import sys + +import pytest + + +from pandas.tests.extension import base + +from .array import JSONArray, JSONDtype, make_data + +pytestmark = pytest.mark.skipif(sys.version_info[0] == 2, + reason="Py2 doesn't have a UserDict") + + +@pytest.fixture +def dtype(): + return JSONDtype() + + +@pytest.fixture +def data(): + """Length-100 JSONArray for semantics test.""" + return JSONArray(make_data()) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return JSONArray([{}, {'a': 10}]) + + +@pytest.fixture
def na_value(): + return {} + + +@pytest.fixture +def na_cmp(): + return operator.eq + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + pass + + +class 
TestGetitem(base.BaseGetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="Unhashable") + def test_value_counts(self, all_data, dropna): + pass + + +class TestCasting(base.BaseCastingTests): + pass diff --git a/pandas/tests/internals/test_external_block.py b/pandas/tests/extension/test_external_block.py similarity index 94% rename from pandas/tests/internals/test_external_block.py rename to pandas/tests/extension/test_external_block.py index 2487363df8f99..991da41168aa0 100644 --- a/pandas/tests/internals/test_external_block.py +++ b/pandas/tests/extension/test_external_block.py @@ -5,12 +5,12 @@ import pandas as pd from pandas.core.internals import ( - BlockManager, SingleBlockManager, ExtensionBlock) + BlockManager, SingleBlockManager, NonConsolidatableMixIn, Block) import pytest -class CustomBlock(ExtensionBlock): +class CustomBlock(NonConsolidatableMixIn, Block): _holder = np.ndarray