diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 37f99bd344e6c..a05018edf0798 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -15,6 +15,7 @@
                       ABCIndexClass)
 from .inference import is_string_like
 from .inference import *  # noqa
+from .costum_dtypes import NumpyDtypeWithMetadata, NumpyDtypeWithMetadataType
 
 
 _POSSIBLY_CAST_DTYPES = set([np.dtype(t).name
@@ -508,6 +509,15 @@ def is_categorical_dtype(arr_or_dtype):
     return CategoricalDtype.is_dtype(arr_or_dtype)
 
 
+def is_numpy_dtype_with_metadata(arr_or_dtype):
+    """
+    Check whether the array or dtype is an instance of NumpyDtypeWithMetadata
+    """
+    if arr_or_dtype is None:
+        return False
+    return NumpyDtypeWithMetadata.is_dtype(arr_or_dtype)
+
+
 def is_string_dtype(arr_or_dtype):
     """
     Check whether the provided array or dtype is of the string dtype.
@@ -1615,6 +1625,8 @@ def is_extension_type(arr):
         return True
     elif is_datetimetz(arr):
         return True
+    elif is_numpy_dtype_with_metadata(arr):
+        return True
     return False
 
 
@@ -1717,6 +1729,8 @@ def _get_dtype(arr_or_dtype):
         return arr_or_dtype
     elif isinstance(arr_or_dtype, IntervalDtype):
         return arr_or_dtype
+    elif isinstance(arr_or_dtype, NumpyDtypeWithMetadata):
+        return arr_or_dtype
     elif isinstance(arr_or_dtype, string_types):
         if is_categorical_dtype(arr_or_dtype):
             return CategoricalDtype.construct_from_string(arr_or_dtype)
@@ -1762,6 +1776,8 @@ def _get_dtype_type(arr_or_dtype):
         return IntervalDtypeType
     elif isinstance(arr_or_dtype, PeriodDtype):
         return PeriodDtypeType
+    elif isinstance(arr_or_dtype, NumpyDtypeWithMetadata):
+        return NumpyDtypeWithMetadataType
     elif isinstance(arr_or_dtype, string_types):
         if is_categorical_dtype(arr_or_dtype):
             return CategoricalDtypeType
@@ -1879,6 +1895,8 @@ def pandas_dtype(dtype):
 
     if isinstance(dtype, DatetimeTZDtype):
         return dtype
+    elif isinstance(dtype, NumpyDtypeWithMetadata):
+        return dtype
     elif isinstance(dtype, PeriodDtype):
         return dtype
     elif isinstance(dtype, CategoricalDtype):
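Aside (not part of the patch): a minimal sketch of how the helpers added to
common.py above are expected to behave, exercised with the AlwaysSame example
dtype that the new costum_dtypes module below introduces. The import paths and
names are taken from this patch; the assertions only restate what the new
branches return.

# Sketch only -- exercises the new introspection branches in common.py.
from pandas.core.dtypes.common import (
    is_numpy_dtype_with_metadata, pandas_dtype, _get_dtype, _get_dtype_type)
from pandas.core.dtypes.costum_dtypes import (
    AlwaysSame, NumpyDtypeWithMetadataType)

dtype = AlwaysSame(42)

assert is_numpy_dtype_with_metadata(dtype)      # new predicate
assert not is_numpy_dtype_with_metadata(None)   # None is rejected explicitly
assert pandas_dtype(dtype) is dtype             # passed through unchanged
assert _get_dtype(dtype) is dtype
assert _get_dtype_type(dtype) is NumpyDtypeWithMetadataType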
diff --git a/pandas/core/dtypes/costum_dtypes.py b/pandas/core/dtypes/costum_dtypes.py
new file mode 100644
index 0000000000000..d3cdf02077ebb
--- /dev/null
+++ b/pandas/core/dtypes/costum_dtypes.py
@@ -0,0 +1,187 @@
+"""
+This module contains an interface that external libraries can use to define
+their own dtypes compatible with pandas (but NOT NUMPY).
+"""
+
+import numpy as np
+
+from .dtypes import ExtensionDtype
+
+class NumpyDtypeWithMetadataType(type):  # Do we need this?
+    """
+    The type of NumpyDtypeWithMetadata.
+    """
+    pass
+
+class NumpyDtypeWithMetadata(ExtensionDtype):
+
+    """
+    An ExtensionDtype for data that can be
+    stored in a numpy dtype, but where the dtype itself
+    carries metadata and may redefine arithmetic operations.
+
+    To properly implement caching behaviour,
+    you might have to implement a __new__ method.
+    """
+    type = NumpyDtypeWithMetadataType
+    # Which attributes should be stored during pickling?
+    # If this is provided, you usually do not have to
+    # override __getstate__.
+    _metadata = []
+
+    def base(self):
+        """
+        The numpy-compatible dtype in which the actual data is stored.
+
+        Example: np.dtype('f8')
+        """
+        raise NotImplementedError("'base' must be implemented by subclass "
+                                  "(probably as a class-level variable)")
+
+
+    @classmethod
+    def construct_from_string(cls, string):
+        """ attempt to construct this type from a string, raise a TypeError if
+        it's not possible """
+        raise NotImplementedError("'construct_from_string' must be implemented by subclass.")
+
+    def operation_typecompatible(self, operation_name, other_dtype, is_left=True):
+        """
+        Is the desired operation possible between this dtype and other_dtype?
+
+        Parameters
+        ----------
+        operation_name: The name of the desired operation, e.g. '__eq__'
+        other_dtype: The dtype of the other operand
+        is_left: Whether this dtype is on the left-hand side of the binary operation.
+
+        Returns
+        -------
+        Boolean or NotImplemented
+        """
+        return False
+
+    def get_operation_wrapper(self):
+        """
+        This is called by `pandas.ops._Op.get_op` to get an object
+        responsible for type-coercion (which should have the same interface as _Op).
+
+        Returns
+        -------
+        A class implementing the same interface as pandas.ops._Op, or None.
+        It should return None if the default _Op class should be used.
+        """
+        return None
+
+    def to_dtype(self, data):
+        """
+        Convert arbitrary data to this dtype.
+
+        Override this if you need any additional conversions.
+
+        Parameters
+        ----------
+        data: array-like
+
+        Returns
+        -------
+        A numpy array with the same dtype as self.base
+        """
+        return np.asarray(data, dtype=self.base)
+
+class AlwaysSame(NumpyDtypeWithMetadata):
+    """
+    This is an example of how a library could implement a
+    subclass of NumpyDtypeWithMetadata, but it is not useful
+    for anything other than testing.
+    """
+    _metadata = ["_target_value", "base"]
+    def __new__(cls, target_value=None):
+        if target_value is None:
+            # We are unpickling
+            return object.__new__(cls)
+        try:
+            return cls._cache[target_value]
+        except KeyError:
+            d = object.__new__(cls)
+            d._target_value = target_value
+            # In this case, we set the base numpy dtype upon object construction.
+            d.base = np.dtype(type(target_value))  # Raises if target_value is not a simple number
+            cls._cache[target_value] = d
+            return d
+
+    def __hash__(self):
+        return hash(self._target_value)
+
+    def __unicode__(self):
+        return "always[{}]".format(repr(self._target_value))
+
+    def __setstate__(self, state):
+        try:
+            self._target_value = state["_target_value"]
+        except KeyError:
+            print("state", state)
+            raise
+        self.base = np.dtype(type(self._target_value))
+
+    def __eq__(self, other):
+        if not isinstance(other, AlwaysSame):
+            return NotImplemented
+        return self._target_value == other._target_value
+
+    def to_dtype(self, data):
+        """
+        Fill the array with the target value.
+        """
+        # Since performance is irrelevant for this test dtype, we
+        # do not try to modify data in-place.
+        data = np.ones(np.shape(data), dtype=self.base)
+        data = data * self._target_value
+        return data
+
+    def get_operation_wrapper(self):
+        """
+        This is called by `pandas.ops._Op.get_op` to get an object
+        responsible for type-coercion (which should have the same interface as _Op).
+
+        Returns
+        -------
+        A class implementing the same interface as pandas.ops._Op, or None.
+        It should return None if the default _Op class should be used.
+        """
+        class AlwaysSameOp():
+            dtype = None
+            fill_value = self._target_value
+            def __init__(self, left, right, name, na_op):
+                self.left = left
+                self.right = right
+
+                self.name = name
+                self.na_op = na_op
+
+                # Here, a conversion of left and right to lvalues and rvalues could take place.
+                # lvalues must be a type that has the desired operator defined.
+                self.lvalues = left
+                self.rvalues = right
+                return None
+            def wrap_results(self, results):
+                print("l,r ", type(self.left), type(self.right))
+                # Comparison operators return dtype bool.
+                if self.name in ["__eq__", "__lt__", "__gt__", "__ge__", "__le__", "__ne__"]:
+                    return results
+                # All other operators return dtype AlwaysSame.
+                if isinstance(self.left.dtype, AlwaysSame):
+                    target_dtype = self.left.dtype
+                else:
+                    assert isinstance(self.right.dtype, AlwaysSame)
+                    target_dtype = self.right.dtype
+                return target_dtype.to_dtype(results)
+        return AlwaysSameOp
+
+    def operation_typecompatible(self, name, other, is_left=True):
+        if isinstance(other, AlwaysSame):
+            if other._target_value != self._target_value:
+                if type(other) != AlwaysSame:
+                    return NotImplemented  # Allow a subclass to decide
+                return False
+        return True
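Aside (not part of the patch): a hypothetical sketch of how a downstream
library might subclass NumpyDtypeWithMetadata. The class name UnitFloatDtype,
the 'unit' field and the 'unitfloat[...]' string form are invented here purely
for illustration; only the hook names come from the interface above.

# Hypothetical third-party dtype, sketched against the interface above.
import numpy as np

from pandas.core.dtypes.costum_dtypes import NumpyDtypeWithMetadata


class UnitFloatDtype(NumpyDtypeWithMetadata):
    """A float64 dtype that remembers a physical unit (illustration only)."""

    base = np.dtype('f8')    # numpy dtype that actually stores the data
    _metadata = ['unit']     # picked up by the default __getstate__

    def __init__(self, unit):
        self.unit = unit

    def __hash__(self):
        return hash(self.unit)

    def __eq__(self, other):
        if not isinstance(other, UnitFloatDtype):
            return NotImplemented
        return self.unit == other.unit

    def __unicode__(self):
        return "unitfloat[{}]".format(self.unit)

    @classmethod
    def construct_from_string(cls, string):
        if isinstance(string, str) and string.startswith("unitfloat[") \
                and string.endswith("]"):
            return cls(string[len("unitfloat["):-1])
        raise TypeError("cannot construct a UnitFloatDtype from "
                        "'{}'".format(string))

    def operation_typecompatible(self, operation_name, other_dtype,
                                 is_left=True):
        # Only allow binary operations between identical units; anything
        # else is left to the default handling in ops.get_op.
        if isinstance(other_dtype, UnitFloatDtype):
            return other_dtype.unit == self.unit
        return True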
+ """ + class AlwaysSameOp(): + dtype = None + fill_value = self._target_value + def __init__(self, left, right, name, na_op): + self.left = left + self.right = right + + self.name = name + self.na_op = na_op + + # Here, a conversion of left and right to lvalues and rvalues could take place. + # lvalues must be a type that has the desired operator defined. + self.lvalues = left + self.rvalues = right + return None + def wrap_results(self, results): + print("l,r ", type(self.left), type(self.right)) + # Comparison operators return dtype bool. + if self.name in ["__eq__", "__lt__", "__gt__", "__ge__", "__le__", "__ne__"]: + return results + # All other operators return dtype AlwaysSame + if isinstance(self.left.dtype, AlwaysSame): + target_dtype = self.left.dtype + else: + assert isinstance(self.right.dtype, AlwaysSame) + target_dtype = self.right.dtype + return target_dtype.to_dtype(results) + return AlwaysSameOp + + def operation_typecompatible(self, name, other, is_left=True): + if isinstance(other, AlwaysSame): + if other._target_value != self._target_value: + if type(other) != AlwaysSame: + return NotImplemented #Allow + return False + return True diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 4e08e1483d617..018440d3fc6e3 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -29,6 +29,7 @@ is_datetimelike_v_numeric, is_integer_dtype, is_categorical_dtype, is_object_dtype, is_timedelta64_dtype, + is_numpy_dtype_with_metadata, is_datetime64_dtype, is_datetime64tz_dtype, is_bool_dtype, is_datetimetz, is_list_like, @@ -330,12 +331,37 @@ def get_op(cls, left, right, name, na_op): is_datetime_lhs = (is_datetime64_dtype(left) or is_datetime64tz_dtype(left)) + if is_numpy_dtype_with_metadata(left): + left_compatible = left.dtype.operation_typecompatible(name, right.dtype, is_left=True) + if left_compatible is not NotImplemented: + if left_compatible: + op_class = left.dtype.get_operation_wrapper() + if op_class is not None: + return op_class(left, right, name, na_op) + else: + return _Op(left, right, name, na_op) + else: + raise TypeError("Operation {} not permitted between " + "dtype {} and type {}".format(name, left.dtype, + right.dtype)) + # left is either not a NumpyDtypeWithMetadata or did not implement the Operation. + if is_numpy_dtype_with_metadata(right): + if right.dtype.operation_typecompatible(name, left.dtype, is_left=False): + op_class = right.dtype.get_operation_wrapper() + if op_class is not None: + return op_class(left, right, name, na_op) + else: + return _Op(left, right, name, na_op) + else: + raise TypeError("Operation {} not permitted between " + "dtype {} and type {}".format(left.dtype, + right.dtype)) + # No NumpyDtypeWithMetadata involved. if not (is_datetime_lhs or is_timedelta_lhs): return _Op(left, right, name, na_op) else: return _TimeOp(left, right, name, na_op) - class _TimeOp(_Op): """ Wrapper around Series datetime/time/timedelta arithmetic operations. 
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 60d268c89a9d7..21042d5137363 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -23,6 +23,7 @@
     is_datetimelike,
     is_datetime64tz_dtype,
     is_timedelta64_dtype,
+    is_numpy_dtype_with_metadata,
     is_list_like,
     is_hashable,
     is_iterator,
@@ -150,6 +151,7 @@ class Series(base.IndexOpsMixin, strings.StringAccessorMixin,
 
     def __init__(self, data=None, index=None, dtype=None, name=None,
                  copy=False, fastpath=False):
+        print("__init__: data = {}, dtype = {}".format(repr(data), repr(dtype)))
 
         # we are called internally, so short-circuit
         if fastpath:
@@ -262,7 +264,10 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
                 data = SingleBlockManager(data, index, fastpath=True)
 
         generic.NDFrame.__init__(self, data, fastpath=True)
-
+        if is_numpy_dtype_with_metadata(dtype):
+            self._extension_dtype = dtype
+        else:
+            self._extension_dtype = None
         self.name = name
         self._set_axis(0, index, fastpath=True)
 
@@ -343,7 +348,10 @@ def name(self, value):
     @property
     def dtype(self):
         """ return the dtype object of the underlying data """
-        return self._data.dtype
+        if self._extension_dtype is None:
+            return self._data.dtype
+        else:
+            return self._extension_dtype
 
     @property
     def dtypes(self):
@@ -2990,7 +2998,6 @@ def _sanitize_array(data, index, dtype=None, copy=False,
 
     if dtype is not None:
         dtype = pandas_dtype(dtype)
-
     if isinstance(data, ma.MaskedArray):
         mask = ma.getmaskarray(data)
         if mask.any():
@@ -3005,11 +3012,13 @@ def _try_cast(arr, take_fast_path):
         if take_fast_path:
             if maybe_castable(arr) and not copy and dtype is None:
                 return arr
-
         try:
             subarr = maybe_cast_to_datetime(arr, dtype)
-            if not is_extension_type(subarr):
+            if not is_extension_type(subarr) and not is_extension_type(dtype):
                 subarr = np.array(subarr, dtype=dtype, copy=copy)
+            else:
+                if is_numpy_dtype_with_metadata(dtype):
+                    subarr = dtype.to_dtype(subarr)
         except (ValueError, TypeError):
             if is_categorical_dtype(dtype):
                 subarr = Categorical(arr)
@@ -3056,6 +3065,7 @@ def _try_cast(arr, take_fast_path):
         if dtype is not None:
             try:
                 subarr = _try_cast(data, False)
+
             except Exception:
                 if raise_cast_failure:  # pragma: no cover
                     raise
@@ -3077,6 +3087,10 @@ def create_from_value(value, index, dtype):
             subarr = DatetimeIndex([value] * len(index), dtype=dtype)
         elif is_categorical_dtype(dtype):
            subarr = Categorical([value] * len(index))
+        elif is_numpy_dtype_with_metadata(dtype):
+            subarr = np.empty(len(index), dtype=dtype.base)
+            subarr.fill(value)
+            subarr = dtype.to_dtype(subarr)
         else:
             if not isinstance(dtype, (np.dtype, type(np.dtype))):
                 dtype = dtype.dtype
@@ -3084,9 +3098,10 @@ def create_from_value(value, index, dtype):
             subarr.fill(value)
 
         return subarr
-
     # scalar like, GH
+    print(type(subarr), dir(subarr))
     if getattr(subarr, 'ndim', 0) == 0:
+        print("subarr", type(subarr), subarr)
         if isinstance(data, list):  # pragma: no cover
             subarr = np.array(data, dtype=object)
         elif index is not None:
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index a1f323aff7c1a..c294bd09af634 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -336,6 +336,8 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
     2 1960-01-04
 
     """
+
+    print("to datetime. Box is {}".format(box))
     from pandas.core.indexes.datetimes import DatetimeIndex
 
     tz = 'utc' if utc else None
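Aside (not part of the patch): with the Series changes above, the extension
dtype passed to the constructor is kept on the instance (_extension_dtype) and
reported back by the .dtype property, while the values themselves are stored
in the dtype's numpy base. A minimal sketch of the intended behaviour:

# Sketch only -- what the new _extension_dtype plumbing is meant to expose.
import pandas as pd

from pandas.core.dtypes.costum_dtypes import AlwaysSame

s = pd.Series([1, 2, 3], dtype=AlwaysSame(42))
print(s.dtype)         # the AlwaysSame(42) instance, from self._extension_dtype
print(s.values.dtype)  # the numpy storage dtype, i.e. AlwaysSame(42).base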
Box is {}".format(box)) from pandas.core.indexes.datetimes import DatetimeIndex tz = 'utc' if utc else None diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2b322431bd301..1b51699a68fa4 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1747,6 +1747,8 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', fmt_klass = Timedelta64Formatter else: fmt_klass = GenericArrayFormatter + if hasattr(values, "values"): + values = values.values if space is None: space = get_option("display.column_space") diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index d9fb458c83529..366bea176719b 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -26,6 +26,7 @@ CategoricalDtype, DatetimeTZDtype, PeriodDtype) +from pandas.core.dtypes.units import DimensionedFloatDtype from pandas.core.dtypes.common import ( is_dtype_equal) from pandas.util import testing as tm @@ -398,6 +399,15 @@ def test_datetimetz_dtype(self): assert find_common_type([dtype, dtype2]) == np.object assert find_common_type([dtype2, dtype]) == np.object + def test_dimensionedFloat_dtype(self): + dtype = DimensionedFloatDtype('meter') + assert find_common_type([dtype, dtype]) == 'dimensionedFloat[meter]' + + for dtype2 in [DimensionedFloatDtype(unit='seconds'), + np.object, np.int64]: + assert find_common_type([dtype, dtype2]) == np.object + assert find_common_type([dtype2, dtype]) == np.object + def test_period_dtype(self): dtype = PeriodDtype(freq='D') assert find_common_type([dtype, dtype]) == 'period[D]' diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 8a36f234484b4..74e9ca5b84ebb 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -9,6 +9,8 @@ import pandas.core.dtypes.common as com import pandas.util.testing as tm +from pandas.core.dtypes.units import (DimensionedFloatDtype, + DimensionedFloatDtypeType) class TestPandasDtype(object): @@ -62,7 +64,8 @@ def test_period_dtype(self): integer=np.dtype(np.int64), float=np.dtype(np.float64), object=np.dtype(np.object), - category=com.pandas_dtype('category')) + category=com.pandas_dtype('category'), + dimensionedDT=DimensionedFloatDtype("meter")) @pytest.mark.parametrize('name1,dtype1', @@ -558,6 +561,8 @@ def test_is_complex_dtype(): (PeriodDtype(freq='D'), PeriodDtype(freq='D')), ('period[D]', PeriodDtype(freq='D')), (IntervalDtype(), IntervalDtype()), + ('dimensionedFloat[second]', DimensionedFloatDtype("second")), + (DimensionedFloatDtype("hour"), DimensionedFloatDtype("hour")), ]) def test__get_dtype(input_param, result): assert com._get_dtype(input_param) == result @@ -604,6 +609,9 @@ def test__get_dtype_fails(input_param): (1, type(None)), (1.2, type(None)), (pd.DataFrame([1, 2]), type(None)), # composite dtype + ('dimensionedFloat[second]', DimensionedFloatDtypeType), + (DimensionedFloatDtype("hour"), DimensionedFloatDtypeType), + ]) def test__get_dtype_type(input_param, result): assert com._get_dtype_type(input_param) == result diff --git a/pandas/tests/dtypes/test_costum.py b/pandas/tests/dtypes/test_costum.py new file mode 100644 index 0000000000000..5750f45117737 --- /dev/null +++ b/pandas/tests/dtypes/test_costum.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +import pytest + +import numpy as np +from numpy.testing import assert_allclose + +import pandas as pd +from pandas.core.dtypes.costum_dtypes import NumpyDtypeWithMetadata, AlwaysSame +from 
diff --git a/pandas/tests/dtypes/test_costum.py b/pandas/tests/dtypes/test_costum.py
new file mode 100644
index 0000000000000..5750f45117737
--- /dev/null
+++ b/pandas/tests/dtypes/test_costum.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import numpy as np
+from numpy.testing import assert_allclose
+
+import pandas as pd
+from pandas.core.dtypes.costum_dtypes import NumpyDtypeWithMetadata, AlwaysSame
+from pandas.core.dtypes.common import is_dtype_equal
+
+from .test_dtypes import Base
+
+def test_always_same_series_construction():
+    a = pd.Series([1, 2, 3, 4, 5, 6], dtype=AlwaysSame(42))
+    # The '==' operator for the whole series will be tested elsewhere
+    for i in range(0, 6):
+        assert a[i] == 42
+
+def test_always_same_with_series_eq():
+    a = pd.Series([1, 2, 3, 4, 5, 6], dtype=AlwaysSame(42))
+    assert np.all(a == a)
+    assert np.all(a == pd.Series([42, 42, 42, 42, 42, 42], dtype=int))
+    assert np.all(a != pd.Series([1, 2, 4, 4, 4, 4], dtype=AlwaysSame(41)))
+    assert np.all(a == pd.Series([1, 2, 4, 4, 4, 4], dtype=AlwaysSame(42)))
+
+
+def test_always_same_with_series_add():
+    a = pd.Series([1, 2, 3, 4, 5, 6], dtype=AlwaysSame(42))
+    assert (a == a + a).all()
+
+
+def test_always_same_with_series_add_mixing_types():
+    a = pd.Series([1, 2, 3, 4, 5, 6], dtype=AlwaysSame(42))
+    b = pd.Series([1, 2, 3, 4, 5, 6], dtype=AlwaysSame(43))
+    c = pd.Series([1, 2, 3, 4, 5, 6], dtype=int)
+    pytest.raises(TypeError,
+                  lambda: a + b)
+    pytest.raises(TypeError,
+                  lambda: b + a)
+    assert (a + c == a).all()
+    assert (c + a == a).all()
+
+
+class TestAlwaysSame(Base):
+    def create(self):
+        return AlwaysSame(42)
+
+    def test_hash_vs_equality(self):
+        # make sure that we satisfy `is` semantics
+        dtype = self.dtype
+        dtype2 = AlwaysSame(42)
+        assert dtype == dtype2
+        assert dtype2 == dtype
+        assert dtype is dtype2
+        assert dtype2 is dtype
+        assert hash(dtype) == hash(dtype2)
+
+
+class TestNumpyDtypeWithMetadata:
+
+    def test_is_not_dtype(self):
+        assert not NumpyDtypeWithMetadata.is_dtype(None)
+        assert not NumpyDtypeWithMetadata.is_dtype(np.float64)
+        assert NumpyDtypeWithMetadata.is_dtype(AlwaysSame('Beer'))
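Aside (not part of the patch): the _metadata list and __setstate__ defined in
costum_dtypes.py exist so that dtype instances survive pickling; this assumes
the _metadata-based __getstate__ that the base ExtensionDtype provides, as the
comment in the new module indicates. A minimal round-trip sketch:

# Sketch only -- pickling the example dtype.
import pickle

from pandas.core.dtypes.costum_dtypes import AlwaysSame

dtype = AlwaysSame(42)
restored = pickle.loads(pickle.dumps(dtype))

assert restored == dtype             # _target_value survives the round trip
assert restored.base == dtype.base   # base is rebuilt inside __setstate__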