diff --git a/.gitignore b/.gitignore index 0d4e8c6fb75a6..657234a985111 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,4 @@ doc/tmp.sv doc/source/styled.xlsx doc/source/templates/ env/ +.mypy_cache diff --git a/doc/source/developer.rst b/doc/source/developer.rst index b8bb2b2fcbe2f..9d6d0b8c2ff5a 100644 --- a/doc/source/developer.rst +++ b/doc/source/developer.rst @@ -140,3 +140,64 @@ As an example of fully-formed metadata: 'metadata': None} ], 'pandas_version': '0.20.0'} + +.. _developer.custom-array-types: + +Custom Array Types +------------------ + +.. versionadded:: 0.23.0 + +.. warning:: + Support for custom array types is experimental. + +Sometimes the NumPy type system isn't rich enough for your needs. Pandas has +made a few extensions internally (e.g. ``Categorical``). While this has worked +well for pandas, not all custom data types belong in pandas itself. + +Pandas defines an interface for custom arrays. Arrays implementing this +interface will be stored correctly in ``Series`` or ``DataFrame``. The ABCs +that must be implemented are + +1. :class:`ExtensionDtype`: A class describing your data type itself. This is + similar to a ``numpy.dtype``. +2. :class:`ExtensionArray`: A container for your data. + +Throughout this document, we'll use the example of storing IPv6 addresses. An +IPv6 address is 128 bits, so NumPy doesn't have a native data type for it. We'll +model it as a structured array with two ``uint64`` fields, which together +represent the 128-bit integer that is the IP address. + +Extension Dtype +''''''''''''''' + +This class should describe your data type. The most important fields are +``name`` and ``base``: + +.. code-block:: python + + class IPv6Type(ExtensionDtype): + name = 'IPv6' + base = np.dtype([('hi', '>u8'), ('lo', '>u8')]) + type = IPTypeType + kind = 'O' + fill_value = np.array([(0, 0)], dtype=base) + +``base`` describes the underlying storage of individual items in your array. +TODO: is this true? Or does ``.base`` refer to the original memory this +is a view on? Different meanings for ``np.dtype.base`` vs. ``np.ndarray.base``? + +In our IPAddress case, we're using a NumPy structured array with two fields. + +Extension Array +''''''''''''''' + +This is the actual array container for your data, similar to a ``Categorical``, +and requires the most work to implement correctly. *pandas makes no assumptions +about how you store the data*. You're free to use NumPy arrays or PyArrow +arrays, or even just Python lists. That said, several of the methods required by +the interface expect NumPy arrays as the return value. + +* ``dtype``: Should be an *instance* of your custom ``ExtensionDtype`` +* ``formatting_values(self)``: Used for printing Series and DataFrame +* ``concat_same_type(to_concat)``: Used in :func:`pd.concat` diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index fcbf42f6dabc4..afff059e7b601 100644 --- a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -1 +1,2 @@ """ public toolkit API """ +from . 
import types, extensions # noqa diff --git a/pandas/api/extensions.py b/pandas/api/extensions.py new file mode 100644 index 0000000000000..e9a9a2f41b50d --- /dev/null +++ b/pandas/api/extensions.py @@ -0,0 +1,4 @@ +from pandas.core.extensions import ( # noqa + ExtensionArray, + ExtensionDtype, +) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c754c063fce8e..ec993374139a2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -6,6 +6,7 @@ from warnings import warn, catch_warnings import numpy as np +from pandas.core.extensions import ExtensionArray from pandas.core.dtypes.cast import ( maybe_promote, construct_1d_object_array_from_listlike) from pandas.core.dtypes.generic import ( @@ -22,7 +23,7 @@ is_categorical, is_datetimetz, is_datetime64_any_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_interval_dtype, - is_scalar, is_list_like, + is_scalar, is_list_like, is_extension_type, _ensure_platform_int, _ensure_object, _ensure_float64, _ensure_uint64, _ensure_int64) @@ -542,9 +543,12 @@ def value_counts(values, sort=True, ascending=False, normalize=False, else: - if is_categorical_dtype(values) or is_sparse(values): - - # handle Categorical and sparse, + if (is_extension_type(values) and not + is_datetime64tz_dtype(values)): + # Need the not is_datetime64tz_dtype since it's actually + # an ndarray. It doesn't have a `.values.value_counts`. + # Perhaps we need a new is_extension_type method that + # distinguishes these... result = Series(values).values.value_counts(dropna=dropna) result.name = name counts = result.values @@ -1323,6 +1327,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_interval_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + elif isinstance(arr, ExtensionArray): + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) diff --git a/pandas/core/base.py b/pandas/core/base.py index e90794c6c2e1a..3bcb51de69d32 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -880,7 +880,7 @@ def _map_values(self, mapper, na_action=None): if isinstance(mapper, ABCSeries): # Since values were input this means we came from either # a dict or a series and mapper should be an index - if is_extension_type(self.dtype): + if is_extension_type(self): values = self._values else: values = self.values @@ -891,7 +891,8 @@ def _map_values(self, mapper, na_action=None): return new_values # we must convert to python types - if is_extension_type(self.dtype): + # TODO: is map part of the interface? 
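+ # A third-party array could also opt in to the duck-typed hasattr + # check below; a sketch, on a hypothetical ExtensionArray subclass: + # def map(self, mapper): + # return type(self)([mapper(x) for x in self])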
+ if is_extension_type(self) and hasattr(self._values, 'map'): values = self._values if na_action is not None: raise NotImplementedError diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 92fcdc0f4625b..6c26d382c0fb5 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -43,6 +43,7 @@ from pandas.io.formats.terminal import get_terminal_size from pandas.util._validators import validate_bool_kwarg from pandas.core.config import get_option +from pandas.core.extensions import ExtensionArray def _cat_compare_op(op): @@ -409,6 +410,11 @@ def dtype(self): """The :class:`~pandas.api.types.CategoricalDtype` for this instance""" return self._dtype + @property + def _block_type(self): + from pandas.core.internals import CategoricalBlock + return CategoricalBlock + @property def _constructor(self): return Categorical @@ -2131,6 +2137,15 @@ def repeat(self, repeats, *args, **kwargs): return self._constructor(values=codes, categories=self.categories, ordered=self.ordered, fastpath=True) + +# TODO: Categorical does not currently implement +# - concat_same_type +# - can_hold_na +# We don't need to implement these, since they're just for +# Block things, and we only use CategoricalBlocks for categoricals. +# We could move that logic from CategoricalBlock to Categorical, +# but holding off for now. +ExtensionArray.register(Categorical) # The Series.cat accessor diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b3ae8aae53b35..e1bce91fee624 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1089,6 +1089,7 @@ def find_common_type(types): numpy.find_common_type """ + # TODO: Make part of the interface? if len(types) == 0: raise ValueError('no types given') @@ -1100,7 +1101,8 @@ def find_common_type(types): if all(is_dtype_equal(first, t) for t in types[1:]): return first - if any(isinstance(t, ExtensionDtype) for t in types): + # TODO: Period is an ExtensionDtype + if any(isinstance(t, (ExtensionDtype, PeriodDtype)) for t in types): return np.object # take lowest unit diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 5d6fc7487eeb5..bd3c2928d4dd0 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -568,7 +568,6 @@ def is_string_dtype(arr_or_dtype): """ # TODO: gh-15585: consider making the checks stricter. - if arr_or_dtype is None: return False try: @@ -1624,11 +1623,13 @@ def is_bool_dtype(arr_or_dtype): def is_extension_type(arr): """ - Check whether an array-like is of a pandas extension class instance. + Check whether an array-like is a pandas extension class instance. Extension classes include categoricals, pandas sparse objects (i.e. classes represented within the pandas library and not ones external - to it like scipy sparse matrices), and datetime-like arrays. + to it like scipy sparse matrices), and datetime-like arrays with + timezones, or any third-party objects satisfying the pandas array + interface. 
Parameters ---------- @@ -1646,39 +1647,44 @@ def is_bool_dtype(arr_or_dtype): False >>> is_extension_type(np.array([1, 2, 3])) False - >>> + + Categoricals >>> cat = pd.Categorical([1, 2, 3]) - >>> >>> is_extension_type(cat) True >>> is_extension_type(pd.Series(cat)) True + + pandas' Sparse arrays >>> is_extension_type(pd.SparseArray([1, 2, 3])) True >>> is_extension_type(pd.SparseSeries([1, 2, 3])) True - >>> >>> from scipy.sparse import bsr_matrix >>> is_extension_type(bsr_matrix([1, 2, 3])) False >>> is_extension_type(pd.DatetimeIndex([1, 2, 3])) False + + pandas' datetime with timezone >>> is_extension_type(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) True - >>> >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern") >>> s = pd.Series([], dtype=dtype) >>> is_extension_type(s) True """ - - if is_categorical(arr): - return True - elif is_sparse(arr): - return True - elif is_datetimetz(arr): - return True - return False + # XXX: we have many places where we call this with a `.dtype`, + # instead of a type. Think about supporting that too... + from pandas.core.extensions import ExtensionArray, ExtensionDtype + return (isinstance(arr, ExtensionArray) or + isinstance(getattr(arr, 'values', None), ExtensionArray) or + # XXX: I don't like this getattr('dtype'), but I think it's + # necessary since DatetimeIndex().values of a datetime w/ tz + # is just a regular numpy array, and not an instance of + # ExtensionArray. I think that's because + # datetime (without tz) is *not* an extension type, but + # datetime[tz] *is* an extension type. + isinstance(getattr(arr, 'dtype', None), ExtensionDtype)) def is_complex_dtype(arr_or_dtype): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 2ec35889d6a7a..ec197a1519b7f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -4,27 +4,11 @@ import numpy as np from pandas import compat from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex +from pandas.core.extensions import ExtensionDtype -class ExtensionDtype(object): - """ - A np.dtype duck-typed class, suitable for holding a custom dtype. 
- - THIS IS NOT A REAL NUMPY DTYPE - """ - name = None - names = None - type = None - subdtype = None - kind = None - str = None - num = 100 - shape = tuple() - itemsize = 8 - base = None - isbuiltin = 0 - isnative = 0 - _metadata = [] +class PandasExtensionMixin(object): + """Useful stuff that isn't in the interface""" _cache = {} def __unicode__(self): @@ -62,17 +46,6 @@ def __repr__(self): """ return str(self) - def __hash__(self): - raise NotImplementedError("sub-classes should implement an __hash__ " - "method") - - def __eq__(self, other): - raise NotImplementedError("sub-classes should implement an __eq__ " - "method") - - def __ne__(self, other): - return not self.__eq__(other) - def __getstate__(self): # pickle support; we don't want to pickle the cache return {k: getattr(self, k, None) for k in self._metadata} @@ -84,9 +57,6 @@ def reset_cache(cls): @classmethod def is_dtype(cls, dtype): - """ Return a boolean if the passed type is an actual dtype that - we can match (via string or type) - """ if hasattr(dtype, 'dtype'): dtype = dtype.dtype if isinstance(dtype, np.dtype): @@ -97,7 +67,7 @@ def is_dtype(cls, dtype): return True try: return cls.construct_from_string(dtype) is not None - except: + except TypeError: return False @@ -108,7 +78,7 @@ class CategoricalDtypeType(type): pass -class CategoricalDtype(ExtensionDtype): +class CategoricalDtype(PandasExtensionMixin, ExtensionDtype): """ Type for categorical data with the categories and orderedness @@ -387,7 +357,7 @@ class DatetimeTZDtypeType(type): pass -class DatetimeTZDtype(ExtensionDtype): +class DatetimeTZDtype(PandasExtensionMixin, ExtensionDtype): """ A np.dtype duck-typed class, suitable for holding a custom datetime with tz @@ -501,7 +471,7 @@ class PeriodDtypeType(type): pass -class PeriodDtype(ExtensionDtype): +class PeriodDtype(PandasExtensionMixin): __metaclass__ = PeriodDtypeType """ A Period duck-typed class, suitable for holding a period with freq dtype. 
@@ -516,6 +486,7 @@ class PeriodDtype(ExtensionDtype): _metadata = ['freq'] _match = re.compile(r"(P|p)eriod\[(?P<freq>.+)\]") _cache = {} + names = None # TODO inherit and remove def __new__(cls, freq=None): """ @@ -619,7 +590,7 @@ class IntervalDtypeType(type): pass -class IntervalDtype(ExtensionDtype): +class IntervalDtype(PandasExtensionMixin, ExtensionDtype): __metaclass__ = IntervalDtypeType """ An Interval duck-typed class, suitable for holding an interval diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index ffac702476af1..ef0f7bd708091 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -18,9 +18,11 @@ is_scalar, is_object_dtype, is_integer, + is_extension_type, _TD_DTYPE, _NS_DTYPE) from .inference import is_list_like +from ..extensions import ExtensionArray isposinf_scalar = libmissing.isposinf_scalar isneginf_scalar = libmissing.isneginf_scalar @@ -57,7 +59,8 @@ def _isna_new(obj): # hack (for now) because MI registers as ndarray elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") - elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)): + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, + ExtensionArray)): return _isna_ndarraylike(obj) elif isinstance(obj, ABCGeneric): return obj._constructor(obj._data.isna(func=isna)) @@ -128,7 +131,9 @@ def _isna_ndarraylike(obj): values = getattr(obj, 'values', obj) dtype = values.dtype - if is_string_dtype(dtype): + if isinstance(values, ExtensionArray): + result = values.isna() + elif is_string_dtype(dtype): if is_categorical_dtype(values): from pandas import Categorical if not isinstance(values, Categorical): @@ -406,4 +411,7 @@ def remove_na_arraylike(arr): """ Return array-like containing only true/non-NaN values, possibly empty. """ - return arr[notna(lib.values_from_object(arr))] + if is_extension_type(arr): + return arr[notna(arr)] + else: + return arr[notna(lib.values_from_object(arr))] diff --git a/pandas/core/extensions.py b/pandas/core/extensions.py new file mode 100644 index 0000000000000..8ca4af5f2f0cc --- /dev/null +++ b/pandas/core/extensions.py @@ -0,0 +1,233 @@ +"""Extend pandas with custom array types. +""" +import abc +import typing as T # noqa + +import numpy as np # noqa + + +class ExtensionDtype(metaclass=abc.ABCMeta): + """A custom data type for your array. + """ + @property + def type(self): + # type: () -> T.Any + """Typically a metaclass inheriting from 'type' with no methods.""" + return type(self.name, (), {}) + + @property + def kind(self): + # type: () -> str + """A character code (one of 'biufcmMOSUV'), default 'O' + + See Also + -------- + numpy.dtype.kind + """ + return 'O' + + @property + @abc.abstractmethod + def name(self): + # type: () -> str + """A string identifying the data type. + + Will be used in, e.g. ``Series.dtype`` + """ + + @property + def names(self): + # type: () -> T.Optional[T.List[str]] + """Ordered list of field names, or None if there are no fields""" + return None + + @classmethod + def construct_from_string(cls, string): + # type: (str) -> ExtensionDtype + """Attempt to construct this type from a string. + + Parameters + ---------- + string : str + + Returns + ------- + self : instance of 'cls' + + Raises + ------ + TypeError + + Notes + ----- + The default implementation checks if 'string' matches your + type's name. If so, it calls your class with no arguments. 
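+ + Examples + -------- + A parametrized dtype can override this. A sketch, assuming a + ``_match`` regex like ``PeriodDtype``'s:: + + @classmethod + def construct_from_string(cls, string): + match = cls._match.match(string) + if match: + return cls(**match.groupdict()) + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string))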
+ """ + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string)) + + @classmethod + def is_dtype(cls, dtype): + # type: (T.Union[str, type]) -> bool + """Check if we match 'dtype' + + Parameters + ---------- + dtype : str or dtype + + Returns + ------- + is_dtype : bool + + Notes + ----- + The default implementation is True if + + 1. 'dtype' is a string that returns true for + ``cls.construct_from_string`` + 2. 'dtype' is ``cls`` or a subclass of ``cls``. + """ + if isinstance(dtype, str): + try: + return isinstance(cls.construct_from_string(dtype), cls) + except TypeError: + return False + else: + return issubclass(dtype, cls) + + +class ExtensionArray(metaclass=abc.ABCMeta): + """Abstract base class for custom array types + + pandas will recognize instances of this class as proper arrays + with a custom type and will not attempt to coerce them to objects. + + Subclasses are expected to implement the following methods. + """ + # ------------------------------------------------------------------------ + # Must be a Sequence + # ------------------------------------------------------------------------ + @abc.abstractmethod + def __getitem__(self, item): + pass + + @abc.abstractmethod + def __iter__(self): + pass + + @abc.abstractmethod + def __len__(self): + pass + + # ------------------------------------------------------------------------ + # Required attributes + # ------------------------------------------------------------------------ + @property + @abc.abstractmethod + def dtype(self): + # type: () -> ExtensionDtype + pass + + @property + def shape(self): + # type: () -> T.Tuple[int, ...] + return (len(self),) + + @property + def ndim(self): + # type: () -> int + """Extension Arrays are only allowed to be 1-dimensional""" + return 1 + + @property + @abc.abstractmethod + def nbytes(self): + # type: () -> int + # TODO: default impl? + pass + + # ------------------------------------------------------------------------ + # Additional Methods + # ------------------------------------------------------------------------ + @abc.abstractmethod + def isna(self): + # type: () -> T.Sequence[bool] + # TODO: narrow this type? + pass + + # ------------------------------------------------------------------------ + # Indexing methods + # ------------------------------------------------------------------------ + @abc.abstractmethod + def take(self, indexer, allow_fill=True, fill_value=None): + # type: (T.Sequence, bool, T.Optional[T.Any]) -> ExtensionArray + """For slicing""" + + @abc.abstractmethod + def take_nd(self, indexer, allow_fill=True, fill_value=None): + """For slicing""" + # TODO: this isn't nescesary if we only allow 1D (though maybe + # impelment it). + + @abc.abstractmethod + def copy(self, deep=False): + # type: (bool) -> ExtensionArray + """For slicing""" + + # ------------------------------------------------------------------------ + # Block-related methods + # ------------------------------------------------------------------------ + @property + def fill_value(self): + # type: () -> T.Any + # TODO + return None + + @abc.abstractmethod + def formatting_values(self): + # type: () -> np.ndarray + # At the moment, this has to be an array since we use result.dtype + """An array of values to be printed in, e.g. 
the Series repr""" + + @classmethod + @abc.abstractmethod + def concat_same_type(cls, to_concat): + # type: (T.Sequence[ExtensionArray]) -> ExtensionArray + """Concatenate multiple array + + Parameters + ---------- + to_concat : sequence of this type + + Returns + ------- + cls + """ + + @abc.abstractmethod + def get_values(self): + # type: () -> np.ndarray + # TODO: What is the required return value? Sequence? ndarray?, ...? + # Categorical does an ndarray + """Get the underlying values backing your data + """ + pass + + @property + @abc.abstractmethod + def can_hold_na(self): + # type: () -> bool + pass + + @property + def is_sparse(self): + # type: () -> bool + return False + + def slice(self, slicer): + # TODO: is this right? + # In general, no. Probably just remove it? + return self.get_values()[slicer] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 43df2c48fcf58..4f2a41c96148b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -517,7 +517,7 @@ def _get_axes(N, K, index=index, columns=columns): index, columns = _get_axes(len(values), 1) return _arrays_to_mgr([values], columns, index, columns, dtype=dtype) - elif is_datetimetz(values): + elif is_extension_type(values): return self._init_dict({0: values}, index, columns, dtype=dtype) # by definition an array here @@ -3346,6 +3346,7 @@ class max type new_obj = self.copy() def _maybe_casted_values(index, labels=None): + # TODO: Handle extension index -> extension array if isinstance(index, PeriodIndex): values = index.astype(object).values elif isinstance(index, DatetimeIndex) and index.tz is not None: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f634d809560ee..0eb7b3f0b8701 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -40,6 +40,7 @@ needs_i8_conversion, is_iterator, is_list_like, is_scalar) +from pandas.core.extensions import ExtensionArray from pandas.core.common import (is_bool_indexer, _values_from_object, _asarray_tuplesafe, _not_none, _index_labels_to_array) @@ -148,6 +149,8 @@ class Index(IndexOpsMixin, PandasObject): _inner_indexer = libjoin.inner_join_indexer_object _outer_indexer = libjoin.outer_join_indexer_object _box_scalars = False + # Whether items returned by self._data.__getitem__ need to be boxed + _box_slices = False _typ = 'index' _data = None @@ -1953,6 +1956,8 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): if is_categorical_dtype(values.dtype): values = np.array(values) + elif isinstance(values, ExtensionArray): + values = np.asarray(values._format_values()) elif is_object_dtype(values.dtype): values = lib.maybe_convert_objects(values, safe=1) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 0ee2f8ebce011..28e626648a2dd 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -287,7 +287,11 @@ def __getitem__(self, key): getitem = self._data.__getitem__ if is_int: val = getitem(key) - return self._box_func(val) + # XXX: Period will be boxed already, datetime won't be + if self._box_slices: + return self._box_func(val) + else: + return val else: if com.is_bool_indexer(key): key = np.asarray(key) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index d83d2d2c93ec8..51abf22aae056 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -273,6 +273,7 @@ class DatetimeIndex(DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin, _typ = 'datetimeindex' _join_precedence = 10 + 
_box_slices = True def _join_i8_wrapper(joinf, **kwargs): return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype='M8[ns]', diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index baf80173d7362..71f30b635976f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -3,16 +3,13 @@ import numpy as np from pandas.core.dtypes.missing import notna, isna -from pandas.core.dtypes.generic import ABCDatetimeIndex, ABCPeriodIndex from pandas.core.dtypes.dtypes import IntervalDtype -from pandas.core.dtypes.cast import maybe_convert_platform, find_common_type +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( _ensure_platform_int, is_list_like, is_datetime_or_timedelta_dtype, is_datetime64tz_dtype, - is_categorical_dtype, - is_string_dtype, is_integer_dtype, is_float_dtype, is_interval_dtype, @@ -28,14 +25,13 @@ from pandas._libs import Timestamp, Timedelta from pandas._libs.interval import ( Interval, IntervalMixin, IntervalTree, - intervals_to_interval_bounds) +) from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.indexes.multi import MultiIndex -from pandas.compat.numpy import function as nv from pandas.core.common import ( - _all_not_none, _any_none, _asarray_tuplesafe, _count_not_none, + _any_none, _asarray_tuplesafe, _count_not_none, is_bool_indexer, _maybe_box_datetimelike, _not_none) from pandas.util._decorators import cache_readonly, Appender from pandas.core.config import get_option @@ -43,6 +39,8 @@ from pandas.tseries.offsets import DateOffset import pandas.core.indexes.base as ibase +from pandas.core.interval import IntervalArray, ScalarDataError + _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( dict(klass='IntervalIndex', @@ -95,30 +93,6 @@ def _get_interval_closed_bounds(interval): return left, right -def maybe_convert_platform_interval(values): - """ - Try to do platform conversion, with special casing for IntervalIndex. - Wrapper around maybe_convert_platform that alters the default return - dtype in certain cases to be compatible with IntervalIndex. For example, - empty lists return with integer dtype instead of object dtype, which is - prohibited for IntervalIndex. - - Parameters - ---------- - values : array-like - - Returns - ------- - array - """ - if isinstance(values, (list, tuple)) and len(values) == 0: - # GH 19016 - # empty lists/tuples get object dtype by default, but this is not - # prohibited for IntervalIndex, so coerce to integer instead - return np.array([], dtype=np.int64) - return maybe_convert_platform(values) - - def _new_IntervalIndex(cls, d): """ This is called upon unpickling, rather than the default which doesn't have @@ -216,77 +190,30 @@ def __new__(cls, data, closed=None, name=None, copy=False, dtype=None, fastpath=False, verify_integrity=True): + # XXX: nail down verify_integrity. + # It should only ever be done on the Interval, yes? 
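+ # Roughly: IntervalIndex(data) builds an IntervalArray from 'data', + # then wraps it via cls._simple_new(array, name); the array now owns + # left/right/closed and the integrity checks.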
if fastpath: - return cls._simple_new(data.left, data.right, closed, name, - copy=copy, verify_integrity=False) + return cls._simple_new(data, name) if name is None and hasattr(data, 'name'): name = data.name - if isinstance(data, IntervalIndex): - left = data.left - right = data.right - closed = data.closed - else: - - # don't allow scalars - if is_scalar(data): - cls._scalar_data_error(data) - - data = maybe_convert_platform_interval(data) - left, right, infer_closed = intervals_to_interval_bounds(data) - - if _all_not_none(closed, infer_closed) and closed != infer_closed: - # GH 18421 - msg = ("conflicting values for closed: constructor got " - "'{closed}', inferred from data '{infer_closed}'" - .format(closed=closed, infer_closed=infer_closed)) - raise ValueError(msg) - - closed = closed or infer_closed + try: + array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype, + fastpath=fastpath, + verify_integrity=verify_integrity) + except ScalarDataError as e: + raise cls._scalar_data_error(data) from e - return cls._simple_new(left, right, closed, name, - copy=copy, verify_integrity=verify_integrity) + return cls._simple_new(array, name, verify_integrity=verify_integrity) @classmethod - def _simple_new(cls, left, right, closed=None, name=None, - copy=False, verify_integrity=True): + def _simple_new(cls, array, name, verify_integrity=True): result = IntervalMixin.__new__(cls) - - if closed is None: - closed = 'right' - left = _ensure_index(left, copy=copy) - right = _ensure_index(right, copy=copy) - - # coerce dtypes to match if needed - if is_float_dtype(left) and is_integer_dtype(right): - right = right.astype(left.dtype) - elif is_float_dtype(right) and is_integer_dtype(left): - left = left.astype(right.dtype) - - if type(left) != type(right): - msg = ('must not have differing left [{ltype}] and right ' - '[{rtype}] types') - raise ValueError(msg.format(ltype=type(left).__name__, - rtype=type(right).__name__)) - elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): - # GH 19016 - msg = ('category, object, and string subtypes are not supported ' - 'for IntervalIndex') - raise TypeError(msg) - elif isinstance(left, ABCPeriodIndex): - msg = 'Period dtypes are not supported, use a PeriodIndex instead' - raise ValueError(msg) - elif (isinstance(left, ABCDatetimeIndex) and - str(left.tz) != str(right.tz)): - msg = ("left and right must have the same time zone, got " - "'{left_tz}' and '{right_tz}'") - raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz)) - - result._left = left - result._right = right - result._closed = closed + result._data = array result.name = name + # XXX: check that we don't verify_integrity twice. Anywhere we do + # array = self._data._simple_new() will have already done it. 
if verify_integrity: result._validate() result._reset_identity() @@ -294,28 +221,10 @@ def _simple_new(cls, left, right, closed=None, name=None, @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, left=None, right=None, **kwargs): - if left is None: - - # no values passed - left, right = self.left, self.right - - elif right is None: - - # only single value passed, could be an IntervalIndex - # or array of Intervals - if not isinstance(left, IntervalIndex): - left = type(self).from_intervals(left) - - left, right = left.left, left.right - else: - - # both left and right are values - pass - + result = self._data._shallow_copy(left=left, right=right) attributes = self._get_attributes_dict() attributes.update(kwargs) - attributes['verify_integrity'] = False - return self._simple_new(left, right, **attributes) + return self._simple_new(result, name=self.name, verify_integrity=False) def _validate(self): """ @@ -435,10 +344,8 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False): IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of tuples """ - breaks = maybe_convert_platform_interval(breaks) - - return cls.from_arrays(breaks[:-1], breaks[1:], closed, - name=name, copy=copy) + array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy) + return cls._simple_new(array, name) @classmethod def from_arrays(cls, left, right, closed='right', name=None, copy=False): @@ -476,11 +383,8 @@ def from_arrays(cls, left, right, closed='right', name=None, copy=False): IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of tuples """ - left = maybe_convert_platform_interval(left) - right = maybe_convert_platform_interval(right) - - return cls._simple_new(left, right, closed, name=name, - copy=copy, verify_integrity=True) + array = IntervalArray.from_arrays(left, right, closed, copy=copy) + return cls._simple_new(array, name, verify_integrity=True) @classmethod def from_intervals(cls, data, name=None, copy=False): @@ -521,13 +425,10 @@ def from_intervals(cls, data, name=None, copy=False): IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of tuples """ - if isinstance(data, IntervalIndex): - left, right, closed = data.left, data.right, data.closed - name = name or data.name - else: - data = maybe_convert_platform_interval(data) - left, right, closed = intervals_to_interval_bounds(data) - return cls.from_arrays(left, right, closed, name=name, copy=False) + arr = IntervalArray.from_intervals(data, copy=copy) + if name is None and isinstance(data, cls): + name = data.name + return cls._simple_new(arr, name=name) @classmethod def from_tuples(cls, data, closed='right', name=None, copy=False): @@ -562,24 +463,8 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): IntervalIndex.from_intervals : Construct an IntervalIndex from an array of Interval objects """ - if len(data): - left, right = [], [] - else: - left = right = data - - for d in data: - if isna(d): - lhs = rhs = np.nan - else: - lhs, rhs = d - left.append(lhs) - right.append(rhs) - - # TODO - # if we have nulls and we previous had *only* - # integer data, then we have changed the dtype - - return cls.from_arrays(left, right, closed, name=name, copy=False) + arr = IntervalArray.from_tuples(data, closed=closed, copy=copy) + return cls._simple_new(arr, name=name) def to_tuples(self, na_tuple=True): """ @@ -601,6 +486,7 @@ def to_tuples(self, na_tuple=True): >>> idx.to_tuples(na_tuple=False) Index([(0.0, 1.0), nan, (2.0, 3.0)], 
dtype='object') """ + # TODO: Move to array? tuples = _asarray_tuplesafe(zip(self.left, self.right)) if not na_tuple: # GH 18756 @@ -618,7 +504,7 @@ def left(self): Return the left endpoints of each Interval in the IntervalIndex as an Index """ - return self._left + return self._data._left @property def right(self): @@ -626,7 +512,7 @@ def right(self): Return the right endpoints of each Interval in the IntervalIndex as an Index """ - return self._right + return self._data._right @property def closed(self): @@ -634,7 +520,7 @@ def closed(self): Whether the intervals are closed on the left-side, right-side, both or neither """ - return self._closed + return self._data._closed @property def length(self): @@ -662,7 +548,7 @@ def values(self): left = self.left right = self.right mask = self._isnan - closed = self._closed + closed = self.closed result = np.empty(len(left), dtype=object) for i in range(len(left)): @@ -691,11 +577,9 @@ def __reduce__(self): @Appender(_index_shared_docs['copy']) def copy(self, deep=False, name=None): - left = self.left.copy(deep=True) if deep else self.left - right = self.right.copy(deep=True) if deep else self.right + array = self._data.copy(deep=deep) name = name if name is not None else self.name - closed = self.closed - return type(self).from_arrays(left, right, closed=closed, name=name) + return self._simple_new(array, name, verify_integrity=False) @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): @@ -1165,33 +1049,9 @@ def _concat_same_dtype(self, to_concat, name): @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): - nv.validate_take(tuple(), kwargs) - indices = _ensure_platform_int(indices) - left, right = self.left, self.right - - if fill_value is None: - fill_value = self._na_value - mask = indices == -1 - - if not mask.any(): - # we won't change dtype here in this case - # if we don't need - allow_fill = False - - taker = lambda x: x.take(indices, allow_fill=allow_fill, - fill_value=fill_value) - - try: - new_left = taker(left) - new_right = taker(right) - except ValueError: - - # we need to coerce; migth have NA's in an - # integer dtype - new_left = taker(left.astype(float)) - new_right = taker(right.astype(float)) - - return self._shallow_copy(new_left, new_right) + result = self._data.take(indices, axis=axis, allow_fill=allow_fill, + fill_value=fill_value, **kwargs) + return self._simple_new(result, self.name, verify_integrity=False) def __getitem__(self, value): mask = self._isnan[value] diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 8b35b1a231551..5272e2b9fb31d 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -9,16 +9,14 @@ is_float, is_integer_dtype, is_float_dtype, - is_scalar, - is_datetime64_dtype, is_datetime64_any_dtype, is_timedelta64_dtype, is_period_dtype, is_bool_dtype, pandas_dtype, - _ensure_object) +) from pandas.core.dtypes.dtypes import PeriodDtype -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.period import PeriodArray import pandas.tseries.frequencies as frequencies from pandas.tseries.frequencies import get_freq_code as _gfc @@ -28,11 +26,10 @@ from pandas.core.tools.datetimes import parse_time_string import pandas.tseries.offsets as offsets -from pandas._libs.lib import infer_dtype from pandas._libs import tslib, index as libindex from pandas._libs.tslibs.period import (Period, IncompatibleFrequency, get_period_field_arr, - 
_validate_end_alias, _quarter_to_myear) + _validate_end_alias) from pandas._libs.tslibs.fields import isleapyear_arr from pandas._libs.tslibs import resolution, period from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds @@ -43,7 +40,7 @@ from pandas import compat from pandas.util._decorators import (Appender, Substitution, cache_readonly, deprecate_kwarg) -from pandas.compat import zip, u +from pandas.compat import u import pandas.core.indexes.base as ibase _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -59,16 +56,6 @@ def f(self): f.__name__ = name f.__doc__ = docstring return property(f) - - -def dt64arr_to_periodarr(data, freq, tz): - if data.dtype != np.dtype('M8[ns]'): - raise ValueError('Wrong dtype: %s' % data.dtype) - - freq = Period._maybe_convert_freq(freq) - base, mult = _gfc(freq) - return period.dt64arr_to_periodarr(data.view('i8'), base, tz) - # --- Period index sketch @@ -119,8 +106,10 @@ def wrapper(self, other): def _new_PeriodIndex(cls, **d): # GH13277 for unpickling - if d['data'].dtype == 'int64': + if isinstance(d['data'], PeriodArray): + values = d.pop('data')._data + elif d['data'].dtype == 'int64': values = d.pop('data') return cls._from_ordinals(values=values, **d) @@ -205,6 +194,7 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): TimedeltaIndex : Index of timedelta64 data """ _box_scalars = True + _box_slices = False _typ = 'periodindex' _attributes = ['name', 'freq'] @@ -239,131 +229,56 @@ def _add_comparison_methods(cls): def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, periods=None, copy=False, name=None, tz=None, dtype=None, **kwargs): - - if periods is not None: - if is_float(periods): - periods = int(periods) - elif not is_integer(periods): - msg = 'periods must be a number, got {periods}' - raise TypeError(msg.format(periods=periods)) - + array = PeriodArray(data, ordinal=ordinal, freq=freq, start=start, + end=end, periods=periods, copy=copy, tz=tz, + dtype=dtype, **kwargs) if name is None and hasattr(data, 'name'): name = data.name - if dtype is not None: - dtype = pandas_dtype(dtype) - if not is_period_dtype(dtype): - raise ValueError('dtype must be PeriodDtype') - if freq is None: - freq = dtype.freq - elif freq != dtype.freq: - msg = 'specified freq and dtype are different' - raise IncompatibleFrequency(msg) - - # coerce freq to freq object, otherwise it can be coerced elementwise - # which is slow - if freq: - freq = Period._maybe_convert_freq(freq) - - if data is None: - if ordinal is not None: - data = np.asarray(ordinal, dtype=np.int64) - else: - data, freq = cls._generate_range(start, end, periods, - freq, kwargs) - return cls._from_ordinals(data, name=name, freq=freq) - - if isinstance(data, PeriodIndex): - if freq is None or freq == data.freq: # no freq change - freq = data.freq - data = data._values - else: - base1, _ = _gfc(data.freq) - base2, _ = _gfc(freq) - data = period.period_asfreq_arr(data._values, - base1, base2, 1) - return cls._simple_new(data, name=name, freq=freq) - - # not array / index - if not isinstance(data, (np.ndarray, PeriodIndex, - DatetimeIndex, Int64Index)): - if is_scalar(data) or isinstance(data, Period): - cls._scalar_data_error(data) - - # other iterable of some kind - if not isinstance(data, (list, tuple)): - data = list(data) - - data = np.asarray(data) - - # datetime other than period - if is_datetime64_dtype(data.dtype): - data = dt64arr_to_periodarr(data, freq, tz) - return cls._from_ordinals(data, name=name, 
freq=freq) - - # check not floats - if infer_dtype(data) == 'floating' and len(data) > 0: - raise TypeError("PeriodIndex does not allow " - "floating point in construction") - - # anything else, likely an array of strings or periods - data = _ensure_object(data) - freq = freq or period.extract_freq(data) - data = period.extract_ordinals(data, freq) - return cls._from_ordinals(data, name=name, freq=freq) + return cls._from_period_array(array, name=name) @cache_readonly def _engine(self): return self._engine_type(lambda: self, len(self)) - @classmethod - def _generate_range(cls, start, end, periods, freq, fields): - if freq is not None: - freq = Period._maybe_convert_freq(freq) - - field_count = len(fields) - if com._count_not_none(start, end) > 0: - if field_count > 0: - raise ValueError('Can either instantiate from fields ' - 'or endpoints, but not both') - subarr, freq = _get_ordinal_range(start, end, periods, freq) - elif field_count > 0: - subarr, freq = _range_from_fields(freq=freq, **fields) - else: - raise ValueError('Not enough parameters to construct ' - 'Period range') - - return subarr, freq - @classmethod def _simple_new(cls, values, name=None, freq=None, **kwargs): """ Values can be any type that can be coerced to Periods. Ordinals in an ndarray are fastpath-ed to `_from_ordinals` """ - if not is_integer_dtype(values): - values = np.array(values, copy=False) - if len(values) > 0 and is_float_dtype(values): - raise TypeError("PeriodIndex can't take floats") - return cls(values, name=name, freq=freq, **kwargs) + # Kept for compatibility with other indexes + if not isinstance(values, PeriodArray): + if not is_integer_dtype(values): + values = np.array(values, copy=False) + if len(values) > 0 and is_float_dtype(values): + raise TypeError("PeriodIndex can't take floats") + + return cls(values, name=name, freq=freq, **kwargs) + else: + return cls._from_period_array(values, name, freq) - return cls._from_ordinals(values, name, freq, **kwargs) @classmethod - def _from_ordinals(cls, values, name=None, freq=None, **kwargs): + def _from_ordinals(cls, values, name, freq): + array = PeriodArray._from_ordinals(values, freq=freq) + return cls._from_period_array(array, name=name, freq=freq) + + @classmethod + def _from_period_array(cls, values, name=None, freq=None): """ Values should be a PeriodArray; `__new__` & `_simple_new` coerce to one and call this method """ - - values = np.array(values, dtype='int64', copy=False) + if freq and values.freq != freq: + # may have to cast here. 
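+ # e.g. PeriodIndex(period_array, freq='M') with a 'D'-freq array; + # PeriodArray.asfreq does the conversion below.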
+ values = values.asfreq(freq) result = object.__new__(cls) result._data = values result.name = name - if freq is None: - raise ValueError('freq is not specified and cannot be inferred') - result.freq = Period._maybe_convert_freq(freq) + result.freq = values.freq result._reset_identity() return result @@ -375,7 +290,7 @@ def _shallow_copy(self, values=None, freq=None, **kwargs): if freq is None: freq = self.freq if values is None: - values = self._values + values = self._data return super(PeriodIndex, self)._shallow_copy(values=values, freq=freq, **kwargs) @@ -408,7 +323,7 @@ def __contains__(self, key): @property def asi8(self): - return self._values.view('i8') + return self._data.asi8 @cache_readonly def _int64index(self): @@ -416,11 +331,13 @@ def _int64index(self): @property def values(self): + """An object array of Periods with our 'freq'.""" return self.astype(object).values @property def _values(self): - return self._data + """The ordinal integers.""" + return self._data._data def __array__(self, dtype=None): if is_integer_dtype(dtype): @@ -581,27 +498,8 @@ def asfreq(self, freq=None, how='E'): [2010-01, ..., 2015-01] Length: 6, Freq: M """ - how = _validate_end_alias(how) - - freq = Period._maybe_convert_freq(freq) - - base1, mult1 = _gfc(self.freq) - base2, mult2 = _gfc(freq) - - asi8 = self.asi8 - # mult1 can't be negative or 0 - end = how == 'E' - if end: - ordinal = asi8 + mult1 - 1 - else: - ordinal = asi8 - - new_data = period.period_asfreq_arr(ordinal, base1, base2, end) - - if self.hasnans: - new_data[self._isnan] = tslib.iNaT - - return self._simple_new(new_data, self.name, freq=freq) + values = self._data.asfreq(freq, how=how) + return self._from_period_array(values, name=self.name) year = _field_accessor('year', 0, "The year of the period") month = _field_accessor('month', 3, "The month as January=1, December=12") @@ -1097,102 +995,6 @@ def tz_localize(self, tz, infer_dst=False): PeriodIndex._add_datetimelike_methods() -def _get_ordinal_range(start, end, periods, freq, mult=1): - if com._count_not_none(start, end, periods) != 2: - raise ValueError('Of the three parameters: start, end, and periods, ' - 'exactly two must be specified') - - if freq is not None: - _, mult = _gfc(freq) - - if start is not None: - start = Period(start, freq) - if end is not None: - end = Period(end, freq) - - is_start_per = isinstance(start, Period) - is_end_per = isinstance(end, Period) - - if is_start_per and is_end_per and start.freq != end.freq: - raise ValueError('start and end must have same freq') - if (start is tslib.NaT or end is tslib.NaT): - raise ValueError('start and end must not be NaT') - - if freq is None: - if is_start_per: - freq = start.freq - elif is_end_per: - freq = end.freq - else: # pragma: no cover - raise ValueError('Could not infer freq from start/end') - - if periods is not None: - periods = periods * mult - if start is None: - data = np.arange(end.ordinal - periods + mult, - end.ordinal + 1, mult, - dtype=np.int64) - else: - data = np.arange(start.ordinal, start.ordinal + periods, mult, - dtype=np.int64) - else: - data = np.arange(start.ordinal, end.ordinal + 1, mult, dtype=np.int64) - - return data, freq - - -def _range_from_fields(year=None, month=None, quarter=None, day=None, - hour=None, minute=None, second=None, freq=None): - if hour is None: - hour = 0 - if minute is None: - minute = 0 - if second is None: - second = 0 - if day is None: - day = 1 - - ordinals = [] - - if quarter is not None: - if freq is None: - freq = 'Q' - base = 
frequencies.FreqGroup.FR_QTR - else: - base, mult = _gfc(freq) - if base != frequencies.FreqGroup.FR_QTR: - raise AssertionError("base must equal FR_QTR") - - year, quarter = _make_field_arrays(year, quarter) - for y, q in zip(year, quarter): - y, m = _quarter_to_myear(y, q, freq) - val = period.period_ordinal(y, m, 1, 1, 1, 1, 0, 0, base) - ordinals.append(val) - else: - base, mult = _gfc(freq) - arrays = _make_field_arrays(year, month, day, hour, minute, second) - for y, mth, d, h, mn, s in zip(*arrays): - ordinals.append(period.period_ordinal( - y, mth, d, h, mn, s, 0, 0, base)) - - return np.array(ordinals, dtype=np.int64), freq - - -def _make_field_arrays(*fields): - length = None - for x in fields: - if isinstance(x, (list, np.ndarray, ABCSeries)): - if length is not None and len(x) != length: - raise ValueError('Mismatched Period array lengths') - elif length is None: - length = len(x) - - arrays = [np.asarray(x) if isinstance(x, (np.ndarray, list, ABCSeries)) - else np.repeat(x, length) for x in fields] - - return arrays - - def pnow(freq=None): # deprecation, xref #13790 warnings.warn("pd.pnow() and pandas.core.indexes.period.pnow() " diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 12ca26cfe0266..158334de9e1df 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -161,6 +161,7 @@ class TimedeltaIndex(DatetimeIndexOpsMixin, TimelikeOps, Int64Index): _typ = 'timedeltaindex' _join_precedence = 10 + _box_slices = True def _join_i8_wrapper(joinf, **kwargs): return DatetimeIndexOpsMixin._join_i8_wrapper( diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 5a4778ae4e629..c6bf09dbd0055 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -13,8 +13,9 @@ from pandas.core.base import PandasObject +from pandas.core.extensions import ExtensionDtype, ExtensionArray from pandas.core.dtypes.dtypes import ( - ExtensionDtype, DatetimeTZDtype, + DatetimeTZDtype, CategoricalDtype) from pandas.core.dtypes.common import ( _TD_DTYPE, _NS_DTYPE, @@ -55,6 +56,7 @@ from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex from pandas.core.common import is_null_slice, _any_not_none + import pandas.core.algorithms as algos from pandas.core.index import Index, MultiIndex, _ensure_index @@ -76,7 +78,169 @@ from pandas.compat import range, map, zip, u -class Block(PandasObject): +class BlockOpsMixin(object): + """Operations that should work on regular or extension blocks + + These methods should only use attributes that are part of the interface. + """ + + def __init__(self, values, placement, ndim=None, fastpath=None): + # Placement must be converted to BlockPlacement via property setter + # before ndim logic, because placement may be a slice which doesn't + # have a length. + self.mgr_locs = placement + + # kludgetastic + if ndim is None: + if len(self.mgr_locs) != 1: + ndim = 1 + else: + ndim = 2 + self.ndim = ndim + + if not isinstance(values, self._holder): + raise TypeError("values must be {0}".format(self._holder.__name__)) + + self.values = values + + @property + def mgr_locs(self): + # TODO: check perf for base... 
+ return self._mgr_locs + + @mgr_locs.setter + def mgr_locs(self, new_mgr_locs): + if not isinstance(new_mgr_locs, BlockPlacement): + new_mgr_locs = BlockPlacement(new_mgr_locs) + + self._mgr_locs = new_mgr_locs + + def copy(self, deep=False, mgr=None): + values = self.values + if deep: + values = values.copy() + return self.make_block_same_class(values) + + def getitem_block(self, slicer, new_mgr_locs=None): + """ + Perform __getitem__-like, return result as block. + + As of now, only supports slices that preserve dimensionality. + """ + if new_mgr_locs is None: + if isinstance(slicer, tuple): + axis0_slicer = slicer[0] + else: + axis0_slicer = slicer + new_mgr_locs = self.mgr_locs[axis0_slicer] + + new_values = self._slice(slicer) + + if self._validate_ndim and new_values.ndim != self.ndim: + raise ValueError("Only same dim slicing is allowed") + + return self.make_block_same_class(new_values, new_mgr_locs) + + def apply(self, func, mgr=None, **kwargs): + """ apply the function to my values; return a block if we are not + one + """ + from pandas.core.internals import Block, _block_shape + + with np.errstate(all='ignore'): + result = func(self.values, **kwargs) + if not isinstance(result, Block): + result = self.make_block(values=_block_shape(result, + ndim=self.ndim)) + + return result + + def make_block(self, values, placement=None, ndim=None, **kwargs): + """ + Create a new block, with type inference propagate any values that are + not specified + """ + from pandas.core.internals import make_block + + if placement is None: + placement = self.mgr_locs + if ndim is None: + ndim = self.ndim + + return make_block(values, placement=placement, ndim=ndim, **kwargs) + + def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): + """ + Take values according to indexer and return them as a block. + + """ + + # algos.take_nd dispatches for DatetimeTZBlock, CategoricalBlock + # so need to preserve types + # sparse is treated like an ndarray, but needs .get_values() shaping + import pandas.core.algorithms as algos + from pandas.core.dtypes.common import is_dtype_equal + + values = self.values + if self.is_sparse: + values = self.get_values() + + if fill_tuple is None: + fill_value = self.fill_value + new_values = algos.take_nd(values, indexer, axis=axis, + allow_fill=False) + else: + fill_value = fill_tuple[0] + new_values = algos.take_nd(values, indexer, axis=axis, + allow_fill=True, fill_value=fill_value) + + if new_mgr_locs is None: + if axis == 0: + slc = lib.indexer_as_slice(indexer) + if slc is not None: + new_mgr_locs = self.mgr_locs[slc] + else: + new_mgr_locs = self.mgr_locs[indexer] + else: + new_mgr_locs = self.mgr_locs + + if not is_dtype_equal(new_values.dtype, self.dtype): + return self.make_block(new_values, new_mgr_locs) + else: + return self.make_block_same_class(new_values, new_mgr_locs) + + @property + def is_sparse(self): + return False + + def ftype(self): + return self.is_sparse + + def ftypes(self): + return self.is_sparse + + def __len__(self): + return len(self.values) + + @property + def fill_value(self): + return self._holder.fill_value + + def make_block_same_class(self, values, placement=None, fastpath=True, + **kwargs): + """ Wrap given values in a block of same type as self. 
""" + from pandas.core.internals import make_block + + if placement is None: + placement = self.mgr_locs + return make_block(values, placement=placement, klass=self.__class__, + fastpath=fastpath, **kwargs) + + def _try_coerce_result(self, result): + return result + + +class Block(BlockOpsMixin, PandasObject): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas data structure @@ -1689,140 +1853,6 @@ def __len__(self): return 0 -class NonConsolidatableMixIn(object): - """ hold methods for the nonconsolidatable blocks """ - _can_consolidate = False - _verify_integrity = False - _validate_ndim = False - _holder = None - - def __init__(self, values, placement, ndim=None, fastpath=False, **kwargs): - - # Placement must be converted to BlockPlacement via property setter - # before ndim logic, because placement may be a slice which doesn't - # have a length. - self.mgr_locs = placement - - # kludgetastic - if ndim is None: - if len(self.mgr_locs) != 1: - ndim = 1 - else: - ndim = 2 - self.ndim = ndim - - if not isinstance(values, self._holder): - raise TypeError("values must be {0}".format(self._holder.__name__)) - - self.values = values - - @property - def shape(self): - if self.ndim == 1: - return (len(self.values)), - return (len(self.mgr_locs), len(self.values)) - - def get_values(self, dtype=None): - """ need to to_dense myself (and always return a ndim sized object) """ - values = self.values.to_dense() - if values.ndim == self.ndim - 1: - values = values.reshape((1,) + values.shape) - return values - - def iget(self, col): - - if self.ndim == 2 and isinstance(col, tuple): - col, loc = col - if not is_null_slice(col) and col != 0: - raise IndexError("{0} only contains one item".format(self)) - return self.values[loc] - else: - if col != 0: - raise IndexError("{0} only contains one item".format(self)) - return self.values - - def should_store(self, value): - return isinstance(value, self._holder) - - def set(self, locs, values, check=False): - assert locs.tolist() == [0] - self.values = values - - def putmask(self, mask, new, align=True, inplace=False, axis=0, - transpose=False, mgr=None): - """ - putmask the data to the block; we must be a single block and not - generate other blocks - - return the resulting block - - Parameters - ---------- - mask : the condition to respect - new : a ndarray/object - align : boolean, perform alignment on other/cond, default is True - inplace : perform inplace modification, default is False - - Returns - ------- - a new block(s), the result of the putmask - """ - inplace = validate_bool_kwarg(inplace, 'inplace') - - # use block's copy logic. - # .values may be an Index which does shallow copy by default - new_values = self.values if inplace else self.copy().values - new_values, _, new, _ = self._try_coerce_args(new_values, new) - - if isinstance(new, np.ndarray) and len(new) == len(mask): - new = new[mask] - - mask = _safe_reshape(mask, new_values.shape) - - new_values[mask] = new - new_values = self._try_coerce_result(new_values) - return [self.make_block(values=new_values)] - - def _slice(self, slicer): - """ return a slice of my values (but densify first) """ - return self.get_values()[slicer] - - def _try_cast_result(self, result, dtype=None): - return result - - def _unstack(self, unstacker_func, new_columns): - """Return a list of unstacked blocks of self - - Parameters - ---------- - unstacker_func : callable - Partially applied unstacker. - new_columns : Index - All columns of the unstacked BlockManager. 
- - Returns - ------- - blocks : list of Block - New blocks of unstacked values. - mask : array_like of bool - The mask of columns of `blocks` we should keep. - """ - # NonConsolidatable blocks can have a single item only, so we return - # one block per item - unstacker = unstacker_func(self.values.T) - new_items = unstacker.get_new_columns() - new_placement = new_columns.get_indexer(new_items) - new_values, mask = unstacker.get_new_values() - - mask = mask.any(0) - new_values = new_values.T[mask] - new_placement = new_placement[mask] - - blocks = [self.make_block_same_class(vals, [place]) - for vals, place in zip(new_values, new_placement)] - return blocks, mask - - class NumericBlock(Block): __slots__ = () is_numeric = True @@ -2330,6 +2360,143 @@ def re_replacer(s): return block +class NonConsolidatableMixIn(object): + """ hold methods for the nonconsolidatable blocks """ + _can_consolidate = False + _verify_integrity = False + _validate_ndim = False + _holder = None + + def __init__(self, values, placement, ndim=None, fastpath=False, **kwargs): + + # Placement must be converted to BlockPlacement via property setter + # before ndim logic, because placement may be a slice which doesn't + # have a length. + self.mgr_locs = placement + + # kludgetastic + if ndim is None: + if len(self.mgr_locs) != 1: + ndim = 1 + else: + ndim = 2 + self.ndim = ndim + + if not isinstance(values, self._holder): + raise TypeError("values must be {0}".format(self._holder.__name__)) + + self.values = values + + @property + def shape(self): + if self.ndim == 1: + return (len(self.values)), + return (len(self.mgr_locs), len(self.values)) + + def get_values(self, dtype=None): + """ need to to_dense myself (and always return a ndim sized object) """ + values = self.values.to_dense() + if values.ndim == self.ndim - 1: + values = values.reshape((1,) + values.shape) + return values + + def iget(self, col): + from pandas.core.common import is_null_slice + + if self.ndim == 2 and isinstance(col, tuple): + col, loc = col + if not is_null_slice(col) and col != 0: + raise IndexError("{0} only contains one item".format(self)) + return self.values[loc] + else: + if col != 0: + raise IndexError("{0} only contains one item".format(self)) + return self.values + + def should_store(self, value): + return isinstance(value, self._holder) + + def set(self, locs, values, check=False): + assert locs.tolist() == [0] + self.values = values + + def putmask(self, mask, new, align=True, inplace=False, axis=0, + transpose=False, mgr=None): + """ + putmask the data to the block; we must be a single block and not + generate other blocks + + return the resulting block + + Parameters + ---------- + mask : the condition to respect + new : a ndarray/object + align : boolean, perform alignment on other/cond, default is True + inplace : perform inplace modification, default is False + + Returns + ------- + a new block(s), the result of the putmask + """ + from pandas.util._validators import validate_bool_kwarg + + inplace = validate_bool_kwarg(inplace, 'inplace') + + # use block's copy logic. 
+ # .values may be an Index which does shallow copy by default + new_values = self.values if inplace else self.copy().values + new_values, _, new, _ = self._try_coerce_args(new_values, new) + + if isinstance(new, np.ndarray) and len(new) == len(mask): + new = new[mask] + + mask = _safe_reshape(mask, new_values.shape) + + new_values[mask] = new + new_values = self._try_coerce_result(new_values) + return [self.make_block(values=new_values)] + + def _slice(self, slicer): + """ return a slice of my values (but densify first) """ + return self.get_values()[slicer] + + def _try_cast_result(self, result, dtype=None): + return result + + def _unstack(self, unstacker_func, new_columns): + """Return a list of unstacked blocks of self + + Parameters + ---------- + unstacker_func : callable + Partially applied unstacker. + new_columns : Index + All columns of the unstacked BlockManager. + + Returns + ------- + blocks : list of Block + New blocks of unstacked values. + mask : array_like of bool + The mask of columns of `blocks` we should keep. + """ + # NonConsolidatable blocks can have a single item only, so we return + # one block per item + unstacker = unstacker_func(self.values.T) + new_items = unstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) + new_values, mask = unstacker.get_new_values() + + mask = mask.any(0) + new_values = new_values.T[mask] + new_placement = new_placement[mask] + + blocks = [self.make_block_same_class(vals, [place]) + for vals, place in zip(new_values, new_placement)] + return blocks, mask + + class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock): __slots__ = () is_categorical = True @@ -2920,7 +3087,12 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, dtype = dtype or values.dtype vtype = dtype.type - if isinstance(values, SparseArray): + # TODO: cleanup + if isinstance(values, ExtensionArray): + # Our classes may implement a custom block type. 3rd + # party classes are currently stuck with ExtensionBlock + klass = getattr(values, '_block_type', ExtensionBlock) + elif isinstance(values, SparseArray): klass = SparseBlock elif issubclass(vtype, np.floating): klass = FloatBlock @@ -2953,6 +3125,76 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, # TODO: flexible with index=None and/or items=None +class ExtensionBlock(BlockOpsMixin, NonConsolidatableMixIn): + """Extend pandas internal storage mechanism. + + This is *not* part of the interface. It's intended to hide Blocks from + third-party libraries. 
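+    Arrays that do not declare a custom ``_block_type`` attribute are
+    wrapped in this class by ``make_block`` below.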
+ """ + _box_to_block_values = False + + def __init__(self, values: 'ExtensionArray', placement, + ndim=None, fastpath=False) -> None: + self._holder = type(values) + super().__init__(values, placement, ndim=ndim, fastpath=fastpath) + + def formatting_values(self) -> np.ndarray: + """An array of values for printing""" + return self.values.formatting_values() + + def concat_same_type(self, to_concat, placement=None): + values = self._holder.concat_same_type([ + blk.values for blk in to_concat + ]) + return self.make_block_same_class( + values, placement=placement or slice(0, len(values), 1) + ) + + def get_values(self, dtype=None): + return self.values.get_values() + + @property + def dtype(self): + """The custom type for your array""" + return self.values.dtype + + def to_dense(self): + """The array backing your data.""" + return self.values.get_values() + + @property + def _can_hold_na(self) -> bool: + """Boolean indicating whether your container holds missing values""" + return self.values.can_hold_na + + def internal_values(self): + """ return an internal format, currently just the ndarray + this should be the pure internal API format + """ + return self.values + + def external_values(self): + return self.values + + @property + def is_sparse(self): + return False + + def ftype(self): + return self.is_sparse + + def ftypes(self): + return self.is_sparse + + @property + def fill_value(self): + return self._holder.fill_value + + def _slice(self, slicer): + """ return a slice of my values (but densify first) """ + return self.values.slice(slicer) + + class BlockManager(PandasObject): """ Core internal data structure to implement DataFrame, Series, Panel, etc. @@ -4410,7 +4652,7 @@ def __init__(self, block, axis, do_integrity_check=False, fastpath=False): 'more than 1 block') block = block[0] - if not isinstance(block, Block): + if not isinstance(block, (Block, ExtensionBlock)): block = make_block(block, placement=slice(0, len(axis)), ndim=1, fastpath=True) @@ -4667,6 +4909,7 @@ def form_blocks(arrays, names, axes): datetime_items = [] datetime_tz_items = [] cat_items = [] + external_items = [] extra_locs = [] names_idx = Index(names) @@ -4704,6 +4947,8 @@ def form_blocks(arrays, names, axes): bool_items.append((i, k, v)) elif is_categorical(v): cat_items.append((i, k, v)) + elif is_extension_type(v): + external_items.append((i, k, v)) else: object_items.append((i, k, v)) @@ -4750,6 +4995,18 @@ def form_blocks(arrays, names, axes): for i, _, array in cat_items] blocks.extend(cat_blocks) + if len(external_items): + external_blocks = [] + for i, _, array in external_items: + if isinstance(array, ABCSeries): + array = array.values + block_type = getattr(array, '_block_type', ExtensionBlock) + external_blocks.append( + make_block(array, klass=block_type, + fastpath=True, placement=[i]) + ) + blocks.extend(external_blocks) + if len(extra_locs): shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) @@ -5052,28 +5309,6 @@ def rrenamer(x): _transform_index(right, rrenamer)) -def _safe_reshape(arr, new_shape): - """ - If possible, reshape `arr` to have shape `new_shape`, - with a couple of exceptions (see gh-13012): - - 1) If `arr` is a Categorical or Index, `arr` will be - returned as is. - 2) If `arr` is a Series, the `_values` attribute will - be reshaped and returned. 
- - Parameters - ---------- - arr : array-like, object to be reshaped - new_shape : int or tuple of ints, the new shape - """ - if isinstance(arr, ABCSeries): - arr = arr._values - if not isinstance(arr, Categorical): - arr = arr.reshape(new_shape) - return arr - - def _transform_index(index, func, level=None): """ Apply function to all values found in index. @@ -5271,6 +5506,7 @@ def get_empty_dtype_and_na(join_units): if dtype is None: continue + # TODO: simplify if is_categorical_dtype(dtype): upcast_cls = 'category' elif is_datetimetz(dtype): @@ -5671,3 +5907,28 @@ def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): if not allow_fill: indexer = maybe_convert_indices(indexer, length) return 'fancy', indexer, len(indexer) + + +def _safe_reshape(arr, new_shape): + """ + If possible, reshape `arr` to have shape `new_shape`, + with a couple of exceptions (see gh-13012): + + 1) If `arr` is a Categorical or Index, `arr` will be + returned as is. + 2) If `arr` is a Series, the `_values` attribute will + be reshaped and returned. + + Parameters + ---------- + arr : array-like, object to be reshaped + new_shape : int or tuple of ints, the new shape + """ + from pandas.core.dtypes.generic import ABCSeries + from pandas.core.categorical import Categorical + + if isinstance(arr, ABCSeries): + arr = arr._values + if not isinstance(arr, Categorical): + arr = arr.reshape(new_shape) + return arr diff --git a/pandas/core/interval.py b/pandas/core/interval.py new file mode 100644 index 0000000000000..e5ce62690beac --- /dev/null +++ b/pandas/core/interval.py @@ -0,0 +1,549 @@ +import numpy as np + +from pandas._libs.interval import (Interval, IntervalMixin, + intervals_to_interval_bounds) +from pandas.compat.numpy import function as nv +from pandas.core.common import _all_not_none +from pandas.core.config import get_option +from pandas.core.dtypes.cast import maybe_convert_platform +from pandas.core.dtypes.common import (_ensure_platform_int, + is_categorical_dtype, is_float_dtype, + is_integer_dtype, is_interval_dtype, + is_scalar, is_string_dtype) +from pandas.core.dtypes.dtypes import IntervalDtype +from pandas.core.dtypes.generic import (ABCDatetimeIndex, ABCPeriodIndex, + ABCSeries) +from pandas.core.dtypes.missing import isna, notna +from pandas.core.extensions import ExtensionArray +from pandas.core.indexes.base import Index, _ensure_index + +_VALID_CLOSED = set(['left', 'right', 'both', 'neither']) + + +class ScalarDataError(TypeError): + # XXX: this is a "hack" to get the right class name in the error + # message. 
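+    # Deriving from TypeError keeps existing ``except TypeError`` handlers
+    # working, while the subclass name appears in tracebacks.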
+ pass + + +class IntervalArray(IntervalMixin, ExtensionArray): + dtype = IntervalDtype() + ndim = 1 + can_hold_na = True + _na_value = fill_value = np.nan + + def __new__(cls, data, closed=None, copy=False, dtype=None, + fastpath=False, verify_integrity=True): + + from pandas.core.indexes.interval import IntervalIndex + + if fastpath: + return cls._simple_new(data.left, data.right, closed, + copy=copy, verify_integrity=False) + + if isinstance(data, ABCSeries) and is_interval_dtype(data): + data = data.values + if isinstance(data, (cls, IntervalIndex)): + left = data.left + right = data.right + closed = data.closed + else: + + # don't allow scalars + if is_scalar(data): + cls._scalar_data_error(data) + + data = maybe_convert_platform_interval(data) + left, right, infer_closed = intervals_to_interval_bounds(data) + + if _all_not_none(closed, infer_closed) and closed != infer_closed: + # GH 18421 + msg = ("conflicting values for closed: constructor got " + "'{closed}', inferred from data '{infer_closed}'" + .format(closed=closed, infer_closed=infer_closed)) + raise ValueError(msg) + + closed = closed or infer_closed + + return cls._simple_new(left, right, closed, + copy=copy, verify_integrity=verify_integrity) + + @classmethod + def _simple_new(cls, left, right, closed=None, + copy=False, verify_integrity=True): + result = IntervalMixin.__new__(cls) + + if closed is None: + closed = 'right' + left = _ensure_index(left, copy=copy) + right = _ensure_index(right, copy=copy) + + # coerce dtypes to match if needed + if is_float_dtype(left) and is_integer_dtype(right): + right = right.astype(left.dtype) + elif is_float_dtype(right) and is_integer_dtype(left): + left = left.astype(right.dtype) + + if type(left) != type(right): + msg = ('must not have differing left [{ltype}] and right ' + '[{rtype}] types') + raise ValueError(msg.format(ltype=type(left).__name__, + rtype=type(right).__name__)) + elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): + # GH 19016 + msg = ('category, object, and string subtypes are not supported ' + 'for IntervalIndex') + raise TypeError(msg) + elif isinstance(left, ABCPeriodIndex): + msg = 'Period dtypes are not supported, use a PeriodIndex instead' + raise ValueError(msg) + elif (isinstance(left, ABCDatetimeIndex) and + str(left.tz) != str(right.tz)): + msg = ("left and right must have the same time zone, got " + "'{left_tz}' and '{right_tz}'") + raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz)) + + result._left = left + result._right = right + result._closed = closed + if verify_integrity: + result._validate() + return result + + @classmethod + def from_breaks(cls, breaks, closed='right', copy=False): + """ + Construct an IntervalIndex from an array of splits + + Parameters + ---------- + breaks : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. 
+        copy : boolean, default False
+            copy the data
+
+        Examples
+        --------
+        >>> pd.IntervalIndex.from_breaks([0, 1, 2, 3])
+        IntervalIndex([(0, 1], (1, 2], (2, 3]]
+                      closed='right',
+                      dtype='interval[int64]')
+
+        See Also
+        --------
+        interval_range : Function to create a fixed frequency IntervalIndex
+        IntervalIndex.from_arrays : Construct an IntervalIndex from a left and
+            right array
+        IntervalIndex.from_intervals : Construct an IntervalIndex from an array
+            of Interval objects
+        IntervalIndex.from_tuples : Construct an IntervalIndex from a
+            list/array of tuples
+        """
+        breaks = maybe_convert_platform_interval(breaks)
+
+        return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy)
+
+    @classmethod
+    def from_arrays(cls, left, right, closed='right', copy=False):
+        """
+        Construct an IntervalIndex from a left and right array
+
+        Parameters
+        ----------
+        left : array-like (1-dimensional)
+            Left bounds for each interval.
+        right : array-like (1-dimensional)
+            Right bounds for each interval.
+        closed : {'left', 'right', 'both', 'neither'}, default 'right'
+            Whether the intervals are closed on the left-side, right-side, both
+            or neither.
+        copy : boolean, default False
+            copy the data
+
+        Examples
+        --------
+        >>> pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3])
+        IntervalIndex([(0, 1], (1, 2], (2, 3]]
+                      closed='right',
+                      dtype='interval[int64]')
+
+        See Also
+        --------
+        interval_range : Function to create a fixed frequency IntervalIndex
+        IntervalIndex.from_breaks : Construct an IntervalIndex from an array of
+            splits
+        IntervalIndex.from_intervals : Construct an IntervalIndex from an array
+            of Interval objects
+        IntervalIndex.from_tuples : Construct an IntervalIndex from a
+            list/array of tuples
+        """
+        left = maybe_convert_platform_interval(left)
+        right = maybe_convert_platform_interval(right)
+
+        return cls._simple_new(left, right, closed, copy=copy,
+                               verify_integrity=True)
+
+    @classmethod
+    def from_intervals(cls, data, copy=False):
+        """
+        Construct an IntervalIndex from a 1d array of Interval objects
+
+        Parameters
+        ----------
+        data : array-like (1-dimensional)
+            Array of Interval objects. All intervals must be closed on the same
+            sides.
+        copy : boolean, default False
+            by default copy the data; kept for compatibility only and ignored
+
+        Examples
+        --------
+        >>> pd.IntervalIndex.from_intervals([pd.Interval(0, 1),
+        ...                                  pd.Interval(1, 2)])
+        IntervalIndex([(0, 1], (1, 2]]
+                      closed='right', dtype='interval[int64]')
+
+        The generic Index constructor works identically when it infers an array
+        of all intervals:
+
+        >>> pd.Index([pd.Interval(0, 1), pd.Interval(1, 2)])
+        IntervalIndex([(0, 1], (1, 2]]
+                      closed='right', dtype='interval[int64]')
+
+        See Also
+        --------
+        interval_range : Function to create a fixed frequency IntervalIndex
+        IntervalIndex.from_arrays : Construct an IntervalIndex from a left and
+            right array
+        IntervalIndex.from_breaks : Construct an IntervalIndex from an array of
+            splits
+        IntervalIndex.from_tuples : Construct an IntervalIndex from a
+            list/array of tuples
+        """
+        from pandas.core.indexes.interval import IntervalIndex
+
+        if isinstance(data, (cls, IntervalIndex)):
+            left, right, closed = data.left, data.right, data.closed
+        else:
+            data = maybe_convert_platform_interval(data)
+            left, right, closed = intervals_to_interval_bounds(data)
+        return cls.from_arrays(left, right, closed, copy=False)
+
+    @classmethod
+    def from_tuples(cls, data, closed='right', copy=False):
+        """
+        Construct an IntervalIndex from a list/array of tuples
+
+        Parameters
+        ----------
+        data : array-like (1-dimensional)
+            Array of tuples
+        closed : {'left', 'right', 'both', 'neither'}, default 'right'
+            Whether the intervals are closed on the left-side, right-side, both
+            or neither.
+        copy : boolean, default False
+            by default copy the data; kept for compatibility only and ignored
+
+        Examples
+        --------
+        >>> pd.IntervalIndex.from_tuples([(0, 1), (1, 2)])
+        IntervalIndex([(0, 1], (1, 2]],
+                      closed='right', dtype='interval[int64]')
+
+        See Also
+        --------
+        interval_range : Function to create a fixed frequency IntervalIndex
+        IntervalIndex.from_arrays : Construct an IntervalIndex from a left and
+            right array
+        IntervalIndex.from_breaks : Construct an IntervalIndex from an array of
+            splits
+        IntervalIndex.from_intervals : Construct an IntervalIndex from an array
+            of Interval objects
+        """
+        if len(data):
+            left, right = [], []
+        else:
+            left = right = data
+
+        for d in data:
+            if isna(d):
+                lhs = rhs = np.nan
+            else:
+                lhs, rhs = d
+            left.append(lhs)
+            right.append(rhs)
+
+        # TODO
+        # if we have nulls and we previously had *only*
+        # integer data, then we have changed the dtype
+
+        return cls.from_arrays(left, right, closed, copy=False)
+
+    def _validate(self):
+        """
+        Verify that the IntervalIndex is valid.
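+        Checks that ``closed`` is one of the valid options, that left and
+        right have equal lengths, that missing values are aligned on both
+        sides, and that ``left <= right`` holds elementwise.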
+        """
+        if self.closed not in _VALID_CLOSED:
+            raise ValueError("invalid options for 'closed': {closed}"
+                             .format(closed=self.closed))
+        if len(self.left) != len(self.right):
+            raise ValueError('left and right must have the same length')
+        left_mask = notna(self.left)
+        right_mask = notna(self.right)
+        if not (left_mask == right_mask).all():
+            raise ValueError('missing values must be missing in the same '
+                             'location both left and right sides')
+        if not (self.left[left_mask] <= self.right[left_mask]).all():
+            raise ValueError('left side of interval must be <= right side')
+        self._mask = ~left_mask
+
+    # ---------
+    # Interface
+    # ---------
+    def __iter__(self):
+        return iter(self.values)
+
+    def __len__(self):
+        return len(self.left)
+
+    def __getitem__(self, value):
+        mask = self.isna()
+        if is_scalar(mask) and mask:
+            return self.fill_value
+
+        left = self.left[value]
+        right = self.right[value]
+
+        # scalar
+        if not isinstance(left, Index):
+            return Interval(left, right, self.closed)
+
+        return self._shallow_copy(left, right)
+
+    def _shallow_copy(self, left=None, right=None):
+        from pandas.core.indexes.interval import IntervalIndex
+
+        if left is None:
+
+            # no values passed
+            # XXX: is ^ right? Or does that mean just left wasn't passed?
+            left, right = self.left, self.right
+
+        elif right is None:
+
+            # only single value passed, could be an IntervalIndex
+            # or array of Intervals
+            if not isinstance(left, (type(self), IntervalIndex)):
+                left = type(self).from_intervals(left)
+
+            left, right = left.left, left.right
+        else:
+
+            # both left and right are values
+            pass
+
+        return self._simple_new(left, right, closed=self.closed,
+                                verify_integrity=False)
+
+    @classmethod
+    def concat_same_type(cls, to_concat):
+        closed = set(interval.closed for interval in to_concat)
+        if len(closed) != 1:
+            raise ValueError("Intervals must all be closed on the same side.")
+        closed = closed.pop()
+
+        # TODO: avoid intermediate list
+        left = np.concatenate([interval.left for interval in to_concat])
+        right = np.concatenate([interval.right for interval in to_concat])
+        return cls._simple_new(left, right, closed=closed, copy=False)
+
+    # TODO: doc
+    def copy(self, deep=False):
+        left = self.left.copy(deep=True) if deep else self.left
+        right = self.right.copy(deep=True) if deep else self.right
+        closed = self.closed
+        return type(self).from_arrays(left, right, closed=closed)
+
+    def formatting_values(self):
+        return self.values
+
+    def get_values(self):
+        return self.values
+
+    def isna(self):
+        return isna(self.left)
+
+    def nbytes(self):
+        # XXX: https://github.com/pandas-dev/pandas/issues/19209
+        return self.values.nbytes
+
+    def take(self, indices, axis=0, allow_fill=True, fill_value=None,
+             **kwargs):
+        nv.validate_take(tuple(), kwargs)
+        indices = _ensure_platform_int(indices)
+        left, right = self.left, self.right
+
+        if fill_value is None:
+            fill_value = self._na_value
+        mask = indices == -1
+
+        if not mask.any():
+            # nothing to fill, so we won't need to change the dtype
+            allow_fill = False
+
+        taker = lambda x: x.take(indices, allow_fill=allow_fill,
+                                 fill_value=fill_value)
+
+        try:
+            new_left = taker(left)
+            new_right = taker(right)
+        except ValueError:
+
+            # we need to coerce; might have NA's in an
+            # integer dtype
+            new_left = taker(left.astype(float))
+            new_right = taker(right.astype(float))
+
+        return self._shallow_copy(new_left, new_right)
+
+    take_nd = take
+
+    def _format_data(self):
+
+        # TODO: integrate with categorical and make generic
+        # name argument is unused
here; just for compat with base / categorical + n = len(self) + max_seq_items = min((get_option( + 'display.max_seq_items') or n) // 10, 10) + + formatter = str + + if n == 0: + summary = '[]' + elif n == 1: + first = formatter(self[0]) + summary = '[{first}]'.format(first=first) + elif n == 2: + first = formatter(self[0]) + last = formatter(self[-1]) + summary = '[{first}, {last}]'.format(first=first, last=last) + else: + + if n > max_seq_items: + n = min(max_seq_items // 2, 10) + head = [formatter(x) for x in self[:n]] + tail = [formatter(x) for x in self[-n:]] + summary = '[{head} ... {tail}]'.format( + head=', '.join(head), tail=', '.join(tail)) + else: + head = [] + tail = [formatter(x) for x in self] + summary = '[{tail}]'.format(tail=', '.join(tail)) + + return summary + + def _format_space(self): + space = ' ' * (len(self.__class__.__name__) + 1) + return "\n{space}".format(space=space) + + @property + def left(self): + """ + Return the left endpoints of each Interval in the IntervalIndex as + an Index + """ + return self._left + + @property + def right(self): + """ + Return the right endpoints of each Interval in the IntervalIndex as + an Index + """ + return self._right + + @property + def closed(self): + """ + Whether the intervals are closed on the left-side, right-side, both or + neither + """ + return self._closed + + @property + def length(self): + """ + Return an Index with entries denoting the length of each Interval in + the IntervalIndex + """ + try: + return self.right - self.left + except TypeError: + # length not defined for some types, e.g. string + msg = ('IntervalIndex contains Intervals without defined length, ' + 'e.g. Intervals with string endpoints') + raise TypeError(msg) + + def __repr__(self): + return "{}({})".format(self.__class__.__name__, self._format_data()) + + @property + def values(self): + """ + Return the IntervalIndex's data as a numpy array of Interval + objects (with dtype='object') + """ + left = self.left + right = self.right + mask = self.isna() + closed = self._closed + + result = np.empty(len(left), dtype=object) + for i in range(len(left)): + if mask[i]: + result[i] = np.nan + else: + result[i] = Interval(left[i], right[i], closed) + return result + + @classmethod + def _scalar_data_error(cls, data): + # TODO: array-mixin + raise ScalarDataError( + '{0}(...) must be called with a collection of some ' + 'kind, {1} was passed'.format(cls.__name__, repr(data)) + ) + + def slice(self, slicer): + left = self.left[slicer] + right = self.right[slicer] + return self._simple_new(left, right, closed=self.closed, + verify_integrity=False) + + +def maybe_convert_platform_interval(values): + """ + Try to do platform conversion, with special casing for IntervalIndex. + Wrapper around maybe_convert_platform that alters the default return + dtype in certain cases to be compatible with IntervalIndex. For example, + empty lists return with integer dtype instead of object dtype, which is + prohibited for IntervalIndex. 
+
+    Parameters
+    ----------
+    values : array-like
+
+    Returns
+    -------
+    array
+    """
+    if isinstance(values, (list, tuple)) and len(values) == 0:
+        # GH 19016
+        # empty lists/tuples get object dtype by default, but this is
+        # prohibited for IntervalIndex, so coerce to integer instead
+        return np.array([], dtype=np.int64)
+    return maybe_convert_platform(values)
diff --git a/pandas/core/period.py b/pandas/core/period.py
new file mode 100644
index 0000000000000..03099cc47c73a
--- /dev/null
+++ b/pandas/core/period.py
@@ -0,0 +1,469 @@
+"""Extension array for Period data
+"""
+import numpy as np
+
+from pandas.core.dtypes.generic import ABCSeries
+from pandas.core.dtypes.missing import isna
+from pandas.core.dtypes.dtypes import PeriodDtype
+from pandas.core import common as com
+from pandas.core.extensions import ExtensionArray
+from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
+from pandas._libs import tslib, iNaT
+from pandas._libs.lib import infer_dtype
+from pandas._libs.tslibs import period
+from pandas._libs.tslibs.period import (
+    IncompatibleFrequency,
+    Period,
+    _quarter_to_myear,
+    _validate_end_alias,
+)
+from pandas.core.dtypes.common import (
+    is_datetime64_dtype,
+    is_float,
+    is_float_dtype,
+    is_integer,
+    is_integer_dtype,
+    is_object_dtype,
+    is_period_dtype,
+    is_scalar,
+    pandas_dtype,
+    _ensure_object,
+)
+import pandas.tseries.frequencies as frequencies
+from pandas.tseries.frequencies import get_freq_code as _gfc
+
+
+def dt64arr_to_periodarr(data, freq, tz):
+    # TODO: the reverse is in period. move there?
+    if data.dtype != np.dtype('M8[ns]'):
+        raise ValueError('Wrong dtype: %s' % data.dtype)
+
+    freq = Period._maybe_convert_freq(freq)
+    base, mult = _gfc(freq)
+    return period.dt64arr_to_periodarr(data.view('i8'), base, tz)
+
+
+def to_period(data):
+    data = np.asanyarray(data)
+    if data.dtype != int:
+        raise ValueError(data.dtype)
+
+    return data
+
+
+def _make_field_arrays(*fields):
+    length = None
+    for x in fields:
+        if isinstance(x, (list, np.ndarray, ABCSeries)):
+            if length is not None and len(x) != length:
+                raise ValueError('Mismatched Period array lengths')
+            elif length is None:
+                length = len(x)
+
+    arrays = [np.asarray(x) if isinstance(x, (np.ndarray, list, ABCSeries))
+              else np.repeat(x, length) for x in fields]
+
+    return arrays
+
+
+def _range_from_fields(year=None, month=None, quarter=None, day=None,
+                       hour=None, minute=None, second=None, freq=None):
+    if hour is None:
+        hour = 0
+    if minute is None:
+        minute = 0
+    if second is None:
+        second = 0
+    if day is None:
+        day = 1
+
+    ordinals = []
+
+    if quarter is not None:
+        if freq is None:
+            freq = 'Q'
+            base = frequencies.FreqGroup.FR_QTR
+        else:
+            base, mult = _gfc(freq)
+            if base != frequencies.FreqGroup.FR_QTR:
+                raise AssertionError("base must equal FR_QTR")
+
+        year, quarter = _make_field_arrays(year, quarter)
+        for y, q in zip(year, quarter):
+            y, m = _quarter_to_myear(y, q, freq)
+            val = period.period_ordinal(y, m, 1, 1, 1, 1, 0, 0, base)
+            ordinals.append(val)
+    else:
+        base, mult = _gfc(freq)
+        arrays = _make_field_arrays(year, month, day, hour, minute, second)
+        for y, mth, d, h, mn, s in zip(*arrays):
+            ordinals.append(period.period_ordinal(
+                y, mth, d, h, mn, s, 0, 0, base))
+
+    return np.array(ordinals, dtype=np.int64), freq
+
+
+def _get_ordinal_range(start, end, periods, freq, mult=1):
+    if com._count_not_none(start, end, periods) != 2:
+        raise ValueError('Of the three parameters: start, end, and periods, '
+                         'exactly two must be specified')
+
+    if freq is not
None: + _, mult = _gfc(freq) + + if start is not None: + start = Period(start, freq) + if end is not None: + end = Period(end, freq) + + is_start_per = isinstance(start, Period) + is_end_per = isinstance(end, Period) + + if is_start_per and is_end_per and start.freq != end.freq: + raise ValueError('start and end must have same freq') + if (start is tslib.NaT or end is tslib.NaT): + raise ValueError('start and end must not be NaT') + + if freq is None: + if is_start_per: + freq = start.freq + elif is_end_per: + freq = end.freq + else: # pragma: no cover + raise ValueError('Could not infer freq from start/end') + + if periods is not None: + periods = periods * mult + if start is None: + data = np.arange(end.ordinal - periods + mult, + end.ordinal + 1, mult, + dtype=np.int64) + else: + data = np.arange(start.ordinal, start.ordinal + periods, mult, + dtype=np.int64) + else: + data = np.arange(start.ordinal, end.ordinal + 1, mult, dtype=np.int64) + + return data, freq + + +# XXX: We inherit from DatetimeIndexOpsMixin to get comparison, arithmetics +# This should be split into an DatetimeArrayOpsMixin, and then any Index +# version that just does index-stuff + + +class PeriodArray(DatetimeIndexOpsMixin, ExtensionArray): + dtype = PeriodDtype() + ndim = 1 + can_hold_na = True + _dtype = None + + def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, + periods=None, copy=False, name=None, tz=None, dtype=None, + **kwargs): + from pandas.core.indexes.datetimes import DatetimeIndex + from pandas.core.indexes.numeric import Int64Index + from pandas.core.indexes.period import PeriodIndex + + if periods is not None: + if is_float(periods): + periods = int(periods) + elif not is_integer(periods): + msg = 'periods must be a number, got {periods}' + raise TypeError(msg.format(periods=periods)) + + if dtype is not None: + dtype = pandas_dtype(dtype) + if not is_period_dtype(dtype): + raise ValueError('dtype must be PeriodDtype') + if freq is None: + freq = dtype.freq + elif freq != dtype.freq: + msg = 'specified freq and dtype are different' + raise IncompatibleFrequency(msg) + + # coerce freq to freq object, otherwise it can be coerced elementwise + # which is slow + if freq: + freq = Period._maybe_convert_freq(freq) + + if data is None: + if ordinal is not None: + data = np.asarray(ordinal, dtype=np.int64) + else: + data, freq = cls._generate_range(start, end, periods, + freq, kwargs) + return cls._from_ordinals(data, freq=freq) + + if isinstance(data, PeriodIndex): + data = data._data + + if isinstance(data, cls): + if freq is None or freq == data.freq: # no freq change + freq = data.freq + data = data._data + else: + base1, _ = _gfc(data.freq) + base2, _ = _gfc(freq) + data = period.period_asfreq_arr(data._data, + base1, base2, 1) + return cls._simple_new(data, freq=freq) + + # not array / index + if not isinstance(data, (np.ndarray, PeriodIndex, + DatetimeIndex, Int64Index)): + if is_scalar(data) or isinstance(data, Period): + cls._scalar_data_error(data) + + # other iterable of some kind + if not isinstance(data, (list, tuple)): + data = list(data) + + data = np.asarray(data) + + # datetime other than period + if is_datetime64_dtype(data.dtype): + data = dt64arr_to_periodarr(data, freq, tz) + return cls._from_ordinals(data, freq=freq) + + # check not floats + if infer_dtype(data) == 'floating' and len(data) > 0: + raise TypeError("PeriodIndex does not allow " + "floating point in construction") + + # anything else, likely an array of strings or periods + data = _ensure_object(data) 
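+        # infer the frequency from the data when it was not given explicitly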
+        freq = freq or period.extract_freq(data)
+        data = period.extract_ordinals(data, freq)
+        return cls._from_ordinals(data, freq=freq)
+
+    @classmethod
+    def _generate_range(cls, start, end, periods, freq, fields):
+        if freq is not None:
+            freq = Period._maybe_convert_freq(freq)
+
+        field_count = len(fields)
+        if com._count_not_none(start, end) > 0:
+            if field_count > 0:
+                raise ValueError('Can either instantiate from fields '
+                                 'or endpoints, but not both')
+            subarr, freq = _get_ordinal_range(start, end, periods, freq)
+        elif field_count > 0:
+            subarr, freq = _range_from_fields(freq=freq, **fields)
+        else:
+            raise ValueError('Not enough parameters to construct '
+                             'Period range')
+
+        return subarr, freq
+
+    @classmethod
+    def _simple_new(cls, values, freq=None):
+        """
+        Values can be any type that can be coerced to Periods.
+        Ordinals in an ndarray are fastpath-ed to `_from_ordinals`
+        """
+        if not is_integer_dtype(values):
+            values = np.array(values, copy=False)
+            if len(values) > 0 and is_float_dtype(values):
+                raise TypeError("PeriodArray can't take floats")
+            return cls(values, freq=freq)
+
+        return cls._from_ordinals(values, freq)
+
+    @classmethod
+    def _from_ordinals(cls, values, freq=None):
+        """
+        Values should be int ordinals
+        `__new__` & `_simple_new` coerce to ordinals and call this method
+        """
+
+        values = np.array(values, dtype='int64', copy=False)
+
+        result = object.__new__(cls)
+        result._data = values
+        if freq is None:
+            raise ValueError('freq is not specified and cannot be inferred')
+        result.freq = Period._maybe_convert_freq(freq)
+        return result
+
+    def __iter__(self):
+        return iter(self._data)
+
+    def __len__(self):
+        return len(self._data)
+
+    def __repr__(self):
+        values = self._format_values()
+        return "PeriodArray({}, freq={}, dtype={})".format(
+            values, self.freq, self.dtype
+        )
+
+    def __getitem__(self, item):
+        if is_scalar(item):
+            return self._box_func(self._data[item])
+        else:
+            values = self._data[item]
+            return self._simple_new(values, self.freq)
+
+    @property
+    def dtype(self):
+        if self._dtype is None:
+            self._dtype = PeriodDtype(self.freq)
+        return self._dtype
+
+    @property
+    def shape(self):
+        return (len(self),)
+
+    @property
+    def values(self):
+        return self.astype(object)
+
+    @property
+    def asi8(self):
+        return self._data.view('i8')
+
+    @property
+    def _box_func(self):
+        return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq)
+
+    def _format_values(self):
+        return np.array(['%s' % x for x in self.values], dtype='object')
+
+    def formatting_values(self):
+        return self._format_values()
+
+    def astype(self, dtype, copy=True, how='start'):
+        dtype = pandas_dtype(dtype)
+        if is_object_dtype(dtype):
+            return np.array([Period._from_ordinal(p, self.freq)
+                             for p in self], dtype='object')
+        else:
+            raise ValueError('invalid dtype')
+
+    def copy(self):
+        return self._from_ordinals(self._data.copy(), freq=self.freq)
+
+    def isna(self):
+        return self.asi8 == iNaT
+
+    def nbytes(self):
+        return self._data.nbytes
+
+    def take(self, indexer, allow_fill=True, fill_value=None):
+        # XXX: is take supposed to be a view?
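+        # (np.ndarray.take returns a copy, so this always builds a new
+        # PeriodArray over copied ordinals, never a view)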
+        return self._from_ordinals(self._data.take(indexer), self.freq)
+
+    take_nd = take
+
+    @classmethod
+    def concat_same_type(cls, to_concat):
+        dtype = to_concat[0].dtype
+        if not all(other.dtype == dtype for other in to_concat):
+            raise TypeError("All frequencies must match")
+        values = np.concatenate([other._data for other in to_concat])
+        return cls._from_ordinals(values, freq=to_concat[0].freq)
+
+    def get_values(self):
+        return self._data
+
+    @classmethod
+    def _scalar_data_error(cls, data):
+        # TODO: array-mixin
+        raise TypeError('{0}(...) must be called with a collection of some '
+                        'kind, {1} was passed'.format(cls.__name__,
+                                                      repr(data)))
+
+    def _get_attributes_dict(self):
+        # TODO: from indexes.base, needed for ops, can remove
+        return {}
+
+    def view(self, cls=None):
+        return self._data.view(cls)
+
+    def equals(self, other):
+        if not isinstance(other, type(self)):
+            return False
+        return (self.freq == other.freq and
+                len(self) == len(other) and
+                np.all(self._data == other._data))
+
+    def slice(self, slicer):
+        return self._from_ordinals(self._data[slicer], freq=self.freq)
+
+    def asfreq(self, freq=None, how='E'):
+        """
+        Convert the PeriodArray to the specified frequency `freq`.
+
+        Parameters
+        ----------
+        freq : str
+            a frequency
+        how : str {'E', 'S'}
+            'E', 'END', or 'FINISH' for end,
+            'S', 'START', or 'BEGIN' for start.
+            Whether the elements should be aligned to the end
+            or start within a period. January 31st ('END') vs.
+            January 1st ('START') for example.
+
+        Returns
+        -------
+        new : PeriodArray with the new frequency
+
+        Examples
+        --------
+        >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='A')
+        >>> pidx
+        <class 'pandas.core.indexes.period.PeriodIndex'>
+        [2010, ..., 2015]
+        Length: 6, Freq: A-DEC
+
+        >>> pidx.asfreq('M')
+        <class 'pandas.core.indexes.period.PeriodIndex'>
+        [2010-12, ..., 2015-12]
+        Length: 6, Freq: M
+
+        >>> pidx.asfreq('M', how='S')
+        <class 'pandas.core.indexes.period.PeriodIndex'>
+        [2010-01, ..., 2015-01]
+        Length: 6, Freq: M
+        """
+        how = _validate_end_alias(how)
+
+        freq = Period._maybe_convert_freq(freq)
+
+        base1, mult1 = _gfc(self.freq)
+        base2, mult2 = _gfc(freq)
+
+        asi8 = self.asi8
+        # mult1 can't be negative or 0
+        end = how == 'E'
+        if end:
+            ordinal = asi8 + mult1 - 1
+        else:
+            ordinal = asi8
+
+        new_data = period.period_asfreq_arr(ordinal, base1, base2, end)
+
+        # XXX: PeriodIndex could cache this. We can't, so this will be slower.
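+        # entries that were NaT must remain NaT after the frequency change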
+        mask = self.isna()
+        if mask.any():
+            new_data[mask] = tslib.iNaT
+
+        return self._from_ordinals(new_data, freq=freq)
+
+    # Pickling
+    def __getnewargs__(self):
+        # values, ordinal, freq
+        return (None, self._data, self.freq)
+
+    def __getstate__(self):
+        return {'ordinal': self._data, 'freq': self.freq}
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+
+
+PeriodArray._add_datetimelike_methods()
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 71cded4f9c888..440bdc01e2539 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -37,7 +37,7 @@
     maybe_cast_to_datetime, maybe_castable,
     construct_1d_arraylike_from_scalar)
 from pandas.core.dtypes.missing import isna, notna, remove_na_arraylike
-
+from pandas.core.extensions import ExtensionArray
 from pandas.core.common import (is_bool_indexer,
                                 _default_index,
                                 _asarray_tuplesafe,
@@ -239,6 +239,11 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
                     copy=copy)
             elif copy:
                 data = data.copy()
+            elif isinstance(data, ExtensionArray):
+                # data = data._block_type(data, index)
+                if copy:
+                    data = data.copy()
+                data = SingleBlockManager(data, index, fastpath=True)
             else:
                 data = _sanitize_array(data, index, dtype, copy,
                                        raise_cast_failure=True)
@@ -2523,7 +2528,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
             return f(self)
 
         # row-wise access
-        if is_extension_type(self.dtype):
+        if is_extension_type(self):
             mapped = self._values.map(f)
         else:
             values = self.astype(object).values
@@ -3217,6 +3222,8 @@ def _try_cast(arr, take_fast_path):
         start, stop, step = get_range_parameters(data)
         arr = np.arange(start, stop, step, dtype='int64')
         subarr = _try_cast(arr, False)
+    elif isinstance(data, ExtensionArray):
+        subarr = data
     else:
         subarr = _try_cast(data, False)
diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py
index 9b2650359bf68..b57e14f57e606 100644
--- a/pandas/core/sparse/array.py
+++ b/pandas/core/sparse/array.py
@@ -9,6 +9,7 @@
 
 import pandas as pd
 from pandas.core.base import PandasObject
+from pandas.core.extensions import ExtensionArray
 from pandas import compat
 from pandas.compat import range
@@ -161,7 +162,7 @@ def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
                        fill_value=fill_value,
                        dtype=dtype)
 
 
-class SparseArray(PandasObject, np.ndarray):
+class SparseArray(PandasObject, np.ndarray, ExtensionArray):
     """Data structure for labeled, sparse floating point 1-D data
 
     Parameters
@@ -272,6 +273,14 @@ def kind(self):
         elif isinstance(self.sp_index, IntIndex):
             return 'integer'
 
+    @property
+    def _block_type(self):
+        from pandas.core.internals import SparseBlock
+        return SparseBlock
+
+    def isna(self):
+        return np.isnan(self)
+
     def __array_wrap__(self, out_arr, context=None):
         """
         NumPy calls this method when ufunc is applied
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 8962eb90be828..821c7858c7a5c 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -122,7 +122,7 @@ def test_api(self):
 
 class TestApi(Base):
 
-    allowed = ['types']
+    allowed = ['types', 'extensions']
 
     def test_api(self):
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index bfec229d32b22..e63e52179d417 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -503,28 +503,26 @@ def test_is_bool_dtype():
     assert com.is_bool_dtype(pd.Index([True, False]))
 
 
-@pytest.mark.parametrize("check_scipy", [
-    False, pytest.param(True, marks=td.skip_if_no_scipy)
-])
-def
test_is_extension_type(check_scipy): - assert not com.is_extension_type([1, 2, 3]) - assert not com.is_extension_type(np.array([1, 2, 3])) - assert not com.is_extension_type(pd.DatetimeIndex([1, 2, 3])) +@pytest.mark.parametrize("name, obj, is_extension", [ + ('list', [1, 2, 3], False), + ('ndarray', np.array([1, 2, 3]), False), + ('datetimeindex', pd.DatetimeIndex([1, 2, 3]), False), # ? + ('category', pd.Categorical([1, 2, 3]), True), + ('series[categorical]', pd.Series(pd.Categorical([1, 2, 3])), True), + ('sparse', pd.SparseArray([1, 2, 3]), True), + ('series[sparse]', pd.SparseSeries([1, 2, 3]), True), + ('datetime-with-tz', pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"), True), - cat = pd.Categorical([1, 2, 3]) - assert com.is_extension_type(cat) - assert com.is_extension_type(pd.Series(cat)) - assert com.is_extension_type(pd.SparseArray([1, 2, 3])) - assert com.is_extension_type(pd.SparseSeries([1, 2, 3])) - assert com.is_extension_type(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) +]) +def test_is_extension_type(name, obj, is_extension): + result = com.is_extension_type(obj) + assert result == is_extension - dtype = DatetimeTZDtype("ns", tz="US/Eastern") - s = pd.Series([], dtype=dtype) - assert com.is_extension_type(s) - if check_scipy: - import scipy.sparse - assert not com.is_extension_type(scipy.sparse.bsr_matrix([1, 2, 3])) +@td.skip_if_no_scipy +def test_is_extension_type_scipy(): + import scipy.sparse + assert not com.is_extension_type(scipy.sparse.bsr_matrix([1, 2, 3])) def test_is_complex_dtype(): diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index c824f0026af50..fa61f4c432dea 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -327,6 +327,8 @@ def test_reset_index_with_intervals(self): assert_frame_equal(result, expected) result2 = result.reset_index() + # XXX: Handle extension index things + # This will fail assert_frame_equal(result2, original) def test_set_index_multiindexcolumns(self): diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index b6d49c9e7ba19..bd1a280bfda4c 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -10,6 +10,7 @@ from pandas.tests.indexes.common import Base import pandas.util.testing as tm import pandas as pd +from pandas.core.interval import IntervalArray @pytest.fixture(scope='class', params=['left', 'right', 'both', 'neither']) @@ -48,8 +49,8 @@ def create_index_with_nan(self, closed='right'): def test_constructors(self, data, closed, name): left, right = data[:-1], data[1:] ivs = [Interval(l, r, closed=closed) for l, r in lzip(left, right)] - expected = IntervalIndex._simple_new( - left=left, right=right, closed=closed, name=name) + arr = IntervalArray._simple_new(left=left, right=right, closed=closed) + expected = IntervalIndex._simple_new(arr, name=name) # validate expected assert expected.closed == closed @@ -1225,3 +1226,10 @@ def test_to_tuples_na(self, tuples, na_tuple): assert all(isna(x) for x in result_na) else: assert isna(result_na) + + def test_from_interval_array(self): + breaks = list(range(10)) + arr = IntervalArray.from_breaks(breaks) + result = IntervalIndex(arr) + expected = IntervalIndex.from_breaks(breaks) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_asfreq.py b/pandas/tests/indexes/period/test_asfreq.py index c8724b2a3bc91..269e36a2ce17e 100644 --- 
a/pandas/tests/indexes/period/test_asfreq.py
+++ b/pandas/tests/indexes/period/test_asfreq.py
@@ -153,3 +153,8 @@ def test_astype_asfreq(self):
         exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='3M')
         tm.assert_index_equal(pi1.asfreq('3M'), exp)
         tm.assert_index_equal(pi1.astype('period[3M]'), exp)
+
+    def test_shallow_copy_asfreq(self):
+        result = PeriodIndex(['2017'], freq='D')._shallow_copy(freq='M')
+        expected = PeriodIndex(['2017'], freq='M')
+        tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/test_extension_arrays/__init__.py b/pandas/tests/test_extension_arrays/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/tests/test_extension_arrays/base.py b/pandas/tests/test_extension_arrays/base.py
new file mode 100644
index 0000000000000..bfc953de0bca7
--- /dev/null
+++ b/pandas/tests/test_extension_arrays/base.py
@@ -0,0 +1,41 @@
+import pandas as pd
+import pandas.util.testing as tm
+from pandas.core.internals import ExtensionBlock
+
+
+class BaseArrayTests:
+
+    def test_series_constructor(self, test_data):
+        result = pd.Series(test_data)
+        assert result.dtype == test_data.dtype
+        assert len(result) == len(test_data)
+        assert isinstance(result._data.blocks[0], ExtensionBlock)
+
+    def test_dataframe_constructor(self, test_data):
+        result = pd.DataFrame({"A": test_data})
+        assert result.dtypes['A'] == test_data.dtype
+        assert result.shape == (len(test_data), 1)
+        assert isinstance(result._data.blocks[0], ExtensionBlock)
+
+    def test_concat(self, test_data):
+        result = pd.concat([
+            pd.Series(test_data),
+            pd.Series(test_data),
+        ], ignore_index=True)
+        assert len(result) == len(test_data) * 2
+
+    def test_iloc(self, test_data):
+        ser = pd.Series(test_data)
+        result = ser.iloc[:4]
+        expected = pd.Series(test_data[:4])
+        tm.assert_series_equal(result, expected)
+
+    def test_loc(self, test_data):
+        ser = pd.Series(test_data)
+        result = ser.loc[[0, 1, 2, 3]]
+        expected = pd.Series(test_data[:4])
+        tm.assert_series_equal(result, expected)
+
+    def test_repr(self, test_data):
+        ser = pd.Series(test_data)
+        repr(ser)
diff --git a/pandas/tests/test_extension_arrays/test_interval.py b/pandas/tests/test_extension_arrays/test_interval.py
new file mode 100644
index 0000000000000..aa6e2b3d7511d
--- /dev/null
+++ b/pandas/tests/test_extension_arrays/test_interval.py
@@ -0,0 +1,16 @@
+import pytest
+
+import pandas as pd
+from pandas.core.interval import IntervalArray
+
+from .base import BaseArrayTests
+
+
+@pytest.fixture
+def test_data():
+    """Length-100 IntervalArray for semantics test."""
+    return IntervalArray(pd.interval_range(0, periods=100))
+
+
+class TestInterval(BaseArrayTests):
+    pass
diff --git a/pandas/tests/test_extension_arrays/test_json.py b/pandas/tests/test_extension_arrays/test_json.py
new file mode 100644
index 0000000000000..b39db9bc125fd
--- /dev/null
+++ b/pandas/tests/test_extension_arrays/test_json.py
@@ -0,0 +1,88 @@
+import itertools
+import json
+import random
+import string
+import sys
+
+import numpy as np
+import pytest
+
+from pandas.core.extensions import ExtensionArray, ExtensionDtype
+
+from .base import BaseArrayTests
+
+
+class JSONType(ExtensionDtype):
+    name = 'json'
+    base = None
+    kind = 'O'
+
+
+class JSONArray(ExtensionArray):
+    dtype = JSONType()
+    fill_value = []
+    can_hold_na = True
+
+    def __init__(self, data):
+        if isinstance(data, str):
+            data = json.loads(data)
+        elif isinstance(data, type(self)):
+            data = data.data
+        assert isinstance(data, list), "'data' must be a list of records."
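+        # the records are kept as a plain Python list of dicts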
+        self.data = data
+
+    def __getitem__(self, item):
+        if isinstance(item, slice):
+            result = self.data[item]
+        else:
+            result = [self.data[i] for i in item]
+        return type(self)(result)
+
+    def __iter__(self):
+        return iter(self.data)
+
+    def __len__(self):
+        return len(self.data)
+
+    @property
+    def nbytes(self):
+        return sum(sys.getsizeof(x) for x in self)
+
+    def isna(self):
+        return np.array([x == [] for x in self])
+
+    def take(self, indexer, allow_fill=True, fill_value=None):
+        return type(self)(self[indexer])
+
+    take_nd = take
+
+    def formatting_values(self):
+        return np.array(self.data).ravel()
+
+    def get_values(self):
+        return np.array(self.data)
+
+    def slice(self, slicer):
+        return self[slicer]
+
+    @classmethod
+    def concat_same_type(cls, to_concat):
+        # flatten the records from each array into a single list
+        return cls(list(itertools.chain.from_iterable(to_concat)))
+
+    def copy(self, deep=False):
+        data = self.data
+        if deep:
+            data = self.data.copy()
+        return type(self)(data)
+
+
+@pytest.fixture
+def test_data():
+    choices = list(string.ascii_letters) + list(range(100))
+    data = [dict([random.choices(choices, k=2)])
+            for _ in range(100)]
+    return JSONArray(data)
+
+
+class TestJSONArray(BaseArrayTests):
+    pass
diff --git a/pandas/tests/test_extension_arrays/test_period.py b/pandas/tests/test_extension_arrays/test_period.py
new file mode 100644
index 0000000000000..59aa5805da232
--- /dev/null
+++ b/pandas/tests/test_extension_arrays/test_period.py
@@ -0,0 +1,69 @@
+import pytest
+
+import numpy as np
+import pandas as pd
+import pandas.util.testing as tm
+from pandas.core.period import PeriodArray
+
+from .base import BaseArrayTests
+
+
+@pytest.fixture
+def test_data():
+    """Length-100 PeriodArray for semantics test."""
+    return PeriodArray(pd.period_range("2000", periods=100))
+
+
+class TestPeriod(BaseArrayTests):
+    pass
+
+
+class TestArray:
+
+    def test_init(self):
+        arr = PeriodArray([2017, 2018], freq='A')
+        assert isinstance(arr, PeriodArray)
+
+    def test_concat(self):
+        p1 = PeriodArray([2017, 2018], freq='A')
+        p2 = PeriodArray([2019, 2020], freq='A')
+        result = pd.concat([pd.Series(p1), pd.Series(p2)], ignore_index=True)
+        expected = pd.Series(PeriodArray([2017, 2018, 2019, 2020], freq='A'))
+        tm.assert_series_equal(result, expected)
+
+    def test_equals(self):
+        p1 = PeriodArray([2017, 2018], freq='A')
+        p2 = PeriodArray([2017, 2018], freq='A')
+        assert p1.equals(p2)
+
+    @pytest.mark.parametrize('other', [
+        2017,
+        [2017, 2018],
+        PeriodArray([2016, 2017], freq='A'),
+        PeriodArray([2017, 2018], freq='A-JAN'),
+        PeriodArray([2017, 2018, 2019], freq='A'),
+    ])
+    def test_equals_unequal(self, other):
+        p1 = PeriodArray([2017, 2018], freq='A')
+        assert not p1.equals(other)
+
+    def test_getitem(self):
+        p1 = PeriodArray([2017, 2018, 2019], freq='A')
+        result = p1[0]
+        expected = pd.Period(2017, freq='A')
+        assert result == expected
+
+        result = p1[[0, 1]]
+        expected = PeriodArray([2017, 2018], freq='A')
+        assert result.equals(expected)
+
+        result = p1[slice(2)]
+        assert result.equals(expected)
+
+        result = p1[np.array([True, True, False])]
+        assert result.equals(expected)
+
+    def test_isna(self):
+        result = PeriodArray(['2018', 'NaT'], freq='D').isna()
+        expected = np.array([False, True])
+        tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/test_extension_arrays/test_pyarrow.py b/pandas/tests/test_extension_arrays/test_pyarrow.py
new file mode 100644
index 0000000000000..3986bb92f256c
--- /dev/null
+++ b/pandas/tests/test_extension_arrays/test_pyarrow.py
@@ -0,0 +1,102 @@
+import collections
+
+import pyarrow as pa
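+# NB: pyarrow is an optional dependency; this module assumes it is installed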
+import pytest
+
+import numpy as np
+import pandas as pd
+from pandas.core.extensions import ExtensionArray, ExtensionDtype
+from .base import BaseArrayTests
+
+
+class MyDtypeType(type):
+    pass
+
+
+class ArrowDtype(ExtensionDtype):
+    _can_hold_na = True
+    type = MyDtypeType
+    base = None
+    name = 'pa64'
+    arrow_type = pa.int64()
+
+
+class ArrowArray(ExtensionArray):
+    dtype = ArrowDtype()
+    ndim = 1
+    can_hold_na = True
+
+    def __init__(self, values):
+        if not isinstance(values, pa.Array):
+            values = pa.array(values)
+        assert values.type == self.dtype.arrow_type
+        self.data = values
+
+    def __iter__(self):
+        return iter(self.data)
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, item):
+        result = self.data[item]
+        if isinstance(item, (slice, collections.Sequence)):
+            return type(self)(result)
+        else:
+            return result
+
+    @property
+    def nbytes(self):
+        return 64 * len(self)
+
+    @property
+    def shape(self):
+        return (len(self),)
+
+    def take(self, indexer, allow_fill=True, fill_value=None):
+        return type(self)(self.data.to_pandas().take(indexer))
+
+    take_nd = take
+
+    def copy(self):
+        # TODO: Jira for pa.array(pyarrow.array)
+        # round-trip through pandas, then re-wrap so the copy is an ArrowArray
+        return type(self)(self.data.to_pandas())
+
+    def isna(self):
+        # https://github.com/apache/arrow/pull/1378
+        return pd.isna(self.data.to_pandas())
+
+    @classmethod
+    def concat_same_type(cls, to_concat):
+        return cls(np.concatenate([arr.data.to_pandas() for arr in to_concat]))
+
+    def get_values(self):
+        return self.data
+
+    def formatting_values(self):
+        return self.data.to_pandas()
+
+    def slice(self, indexer):
+        return self[indexer]
+
+
+@pytest.fixture
+def test_data():
+    """Length-100 int64 arrow array for semantics test."""
+    return ArrowArray(np.arange(100))
+
+
+class TestArrow(BaseArrayTests):
+    def test_iloc(self, test_data):
+        ser = pd.Series(test_data)
+        result = ser.iloc[:4]
+        expected = test_data[:4]
+        assert isinstance(result, pd.Series)
+        assert result.values.data.equals(expected.data)
+
+    def test_loc(self, test_data):
+        ser = pd.Series(test_data)
+        result = ser.loc[[0, 1, 2, 3]]
+        expected = test_data[:4]
+        assert isinstance(result, pd.Series)
+        assert result.values.data.equals(expected.data)
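For reference, wiring a complete third-party container into this prototype looks roughly like the sketch below. It mirrors the ``JSONArray`` and ``ArrowArray`` test fixtures above; the ``AngleDtype``/``AngleArray`` names are hypothetical, and the method set is the experimental one this patch defines, so the details are likely to change.

.. code-block:: python

    import numpy as np
    import pandas as pd
    from pandas.core.extensions import ExtensionArray, ExtensionDtype


    class AngleDtype(ExtensionDtype):
        name = 'angle'
        base = None
        kind = 'f'


    class AngleArray(ExtensionArray):
        """Angles in radians, backed by a plain float64 ndarray."""
        dtype = AngleDtype()
        can_hold_na = True
        fill_value = np.nan

        def __init__(self, values):
            self.data = np.asarray(values, dtype='float64')

        def __len__(self):
            return len(self.data)

        def __iter__(self):
            return iter(self.data)

        def __getitem__(self, item):
            result = self.data[item]
            if np.ndim(result) == 0:
                return result          # scalar access
            return type(self)(result)

        def isna(self):
            return np.isnan(self.data)

        def take(self, indexer, allow_fill=True, fill_value=None):
            # simplified: ignores fill_value, like the JSON fixture above
            return type(self)(self.data.take(indexer))

        take_nd = take

        def copy(self, deep=False):
            return type(self)(self.data.copy())

        def slice(self, slicer):
            return type(self)(self.data[slicer])

        def get_values(self):
            return self.data

        def formatting_values(self):
            return self.data

        @property
        def nbytes(self):
            return self.data.nbytes

        @classmethod
        def concat_same_type(cls, to_concat):
            return cls(np.concatenate([arr.data for arr in to_concat]))


    # Stored in a Series, the array is wrapped in an ExtensionBlock:
    s = pd.Series(AngleArray([0.0, np.pi / 2, np.pi]))

As with the fixtures above, the array is handed straight to the ``Series`` constructor and round-trips through an ``ExtensionBlock``.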