From 517f068c7fda53a81109094d7ccd68336c6faad5 Mon Sep 17 00:00:00 2001 From: Bernhard Thiel Date: Wed, 2 Aug 2017 06:58:43 +0200 Subject: [PATCH 1/9] Started to implement extentionDtype for units --- pandas/core/dtypes/common.py | 27 +++++++++++++++++++-- pandas/core/ops.py | 38 ++++++++++++++++++++++++++++++ pandas/tests/dtypes/test_cast.py | 10 ++++++++ pandas/tests/dtypes/test_common.py | 10 ++++++-- 4 files changed, 81 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 37f99bd344e6c..0a11a813af115 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -9,6 +9,7 @@ PeriodDtype, PeriodDtypeType, IntervalDtype, IntervalDtypeType, ExtensionDtype) +from .units import DimensionedFloatDtype, DimensionedFloatDtypeType from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, @@ -507,6 +508,10 @@ def is_categorical_dtype(arr_or_dtype): return False return CategoricalDtype.is_dtype(arr_or_dtype) +def is_dimensionedFloat_dtype(arr_or_dtype): + if arr_or_dtype is None: + return False + return DimensionedFloatDtype.is_dtype(arr_or_dtype) def is_string_dtype(arr_or_dtype): """ @@ -682,11 +687,13 @@ def is_dtype_equal(source, target): """ try: + print("Comparing {} and {}".format(source, target)) source = _get_dtype(source) target = _get_dtype(target) + print("Comparing {} and {}".format(source, target)) return source == target - except (TypeError, AttributeError): - + except (TypeError, AttributeError) as e: + print(e) # invalid comparison # object == category will hit this return False @@ -1717,6 +1724,9 @@ def _get_dtype(arr_or_dtype): return arr_or_dtype elif isinstance(arr_or_dtype, IntervalDtype): return arr_or_dtype + elif isinstance(arr_or_dtype, DimensionedFloatDtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, string_types): if is_categorical_dtype(arr_or_dtype): return CategoricalDtype.construct_from_string(arr_or_dtype) @@ -1726,6 +1736,8 @@ def _get_dtype(arr_or_dtype): return PeriodDtype.construct_from_string(arr_or_dtype) elif is_interval_dtype(arr_or_dtype): return IntervalDtype.construct_from_string(arr_or_dtype) + elif arr_or_dtype.startswith("dimensionedFloat"): + return DimensionedFloatDtype.construct_from_string(arr_or_dtype) elif isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex)): return arr_or_dtype.dtype @@ -1762,6 +1774,8 @@ def _get_dtype_type(arr_or_dtype): return IntervalDtypeType elif isinstance(arr_or_dtype, PeriodDtype): return PeriodDtypeType + elif isinstance(arr_or_dtype, DimensionedFloatDtype): + return DimensionedFloatDtypeType elif isinstance(arr_or_dtype, string_types): if is_categorical_dtype(arr_or_dtype): return CategoricalDtypeType @@ -1771,6 +1785,8 @@ def _get_dtype_type(arr_or_dtype): return PeriodDtypeType elif is_interval_dtype(arr_or_dtype): return IntervalDtypeType + elif arr_or_dtype.startswith("dimensionedFloat"): + return DimensionedFloatDtypeType return _get_dtype_type(np.dtype(arr_or_dtype)) try: return arr_or_dtype.dtype.type @@ -1879,6 +1895,8 @@ def pandas_dtype(dtype): if isinstance(dtype, DatetimeTZDtype): return dtype + elif isinstance(dtype, DimensionedFloatDtype): + return dtype elif isinstance(dtype, PeriodDtype): return dtype elif isinstance(dtype, CategoricalDtype): @@ -1904,6 +1922,11 @@ def pandas_dtype(dtype): except TypeError: pass + elif dtype.startswith('dimensionedFloat['): + try: + return DimensionedFloatDtype.construct_from_string(dtype) + except TypeError: + pass try: return CategoricalDtype.construct_from_string(dtype) except TypeError: diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 4e08e1483d617..2ab0a091e73f3 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -24,11 +24,13 @@ from pandas.errors import PerformanceWarning from pandas.core.common import _values_from_object, _maybe_match_name from pandas.core.dtypes.missing import notna, isna +from pandas.core.dtypes.units import unit_registry from pandas.core.dtypes.common import ( needs_i8_conversion, is_datetimelike_v_numeric, is_integer_dtype, is_categorical_dtype, is_object_dtype, is_timedelta64_dtype, + is_dimensionedFloat_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_bool_dtype, is_datetimetz, is_list_like, @@ -330,11 +332,47 @@ def get_op(cls, left, right, name, na_op): is_datetime_lhs = (is_datetime64_dtype(left) or is_datetime64tz_dtype(left)) + print("dtypes l,r:", left.dtype, right.dtype) + if is_dimensionedFloat_dtype(left.dtype) or is_dimensionedFloat_dtype(right.dtype): + if (is_datetime_lhs or is_timedelta_lhs): + raise TypeError("Cannot mix DimensionedFloat and Time for operations") + print("getDimFloatOP") + return _DimFloatOp(left, right, name, na_op) if not (is_datetime_lhs or is_timedelta_lhs): + print("Get normal OP") return _Op(left, right, name, na_op) else: return _TimeOp(left, right, name, na_op) +class _DimFloatOp(_Op): + def __init__(self, left, right, name, na_op): + super(_DimFloatOp, self).__init__(left, right, name, na_op) + # Get the type of the calculation's result. + self.dtype = self._get_target_unit(left, right) + # + self.lvalues = self._with_unit(data) + self.rvalues = self._with_unit(data) + + @staticmethod + def _get_target_dtype(left, right, name): + # Perform the operation on 1* the unit, to quickly get the resulting unit + # Raises an Error, if the units are incompatible + left_unit = self._get_unit(left) + right_unit = self._get_unit(right) + calc_result = (getattr(1*left.unit, name)(1*right.unit)) + if isinstance( calc_result, bool): + return bool + else: + return DimensionedFloatDtype(str(calc_result.unit)) + print("Target dtype", target_unit) + return target_unit + @staticmethod + def _with_unit(data): + if hasattr(data, "unit"): + return data.unit*data + return data + + class _TimeOp(_Op): """ diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index d9fb458c83529..366bea176719b 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -26,6 +26,7 @@ CategoricalDtype, DatetimeTZDtype, PeriodDtype) +from pandas.core.dtypes.units import DimensionedFloatDtype from pandas.core.dtypes.common import ( is_dtype_equal) from pandas.util import testing as tm @@ -398,6 +399,15 @@ def test_datetimetz_dtype(self): assert find_common_type([dtype, dtype2]) == np.object assert find_common_type([dtype2, dtype]) == np.object + def test_dimensionedFloat_dtype(self): + dtype = DimensionedFloatDtype('meter') + assert find_common_type([dtype, dtype]) == 'dimensionedFloat[meter]' + + for dtype2 in [DimensionedFloatDtype(unit='seconds'), + np.object, np.int64]: + assert find_common_type([dtype, dtype2]) == np.object + assert find_common_type([dtype2, dtype]) == np.object + def test_period_dtype(self): dtype = PeriodDtype(freq='D') assert find_common_type([dtype, dtype]) == 'period[D]' diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 8a36f234484b4..a2ef5243053da 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -9,7 +9,7 @@ import pandas.core.dtypes.common as com import pandas.util.testing as tm - +from pandas.core.dtypes.units import DimensionedFloatDtype, DimensionedFloatDtypeType class TestPandasDtype(object): @@ -62,7 +62,8 @@ def test_period_dtype(self): integer=np.dtype(np.int64), float=np.dtype(np.float64), object=np.dtype(np.object), - category=com.pandas_dtype('category')) + category=com.pandas_dtype('category'), + dimensionedDT=DimensionedFloatDtype("meter")) @pytest.mark.parametrize('name1,dtype1', @@ -558,6 +559,8 @@ def test_is_complex_dtype(): (PeriodDtype(freq='D'), PeriodDtype(freq='D')), ('period[D]', PeriodDtype(freq='D')), (IntervalDtype(), IntervalDtype()), + ('dimensionedFloat[second]', DimensionedFloatDtype("second")), + (DimensionedFloatDtype("hour"), DimensionedFloatDtype("hour")), ]) def test__get_dtype(input_param, result): assert com._get_dtype(input_param) == result @@ -604,6 +607,9 @@ def test__get_dtype_fails(input_param): (1, type(None)), (1.2, type(None)), (pd.DataFrame([1, 2]), type(None)), # composite dtype + ('dimensionedFloat[second]', DimensionedFloatDtypeType), + (DimensionedFloatDtype("hour"), DimensionedFloatDtypeType), + ]) def test__get_dtype_type(input_param, result): assert com._get_dtype_type(input_param) == result From 10e04d6135ad11cb2f1df01bf06dfc341ece56a9 Mon Sep 17 00:00:00 2001 From: Bernhard Thiel Date: Wed, 2 Aug 2017 07:04:16 +0200 Subject: [PATCH 2/9] Added files missing from prev. commit --- pandas/core/dtypes/units.py | 84 +++++++++++++++++++++++++++++++ pandas/tests/dtypes/test_units.py | 73 +++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 pandas/core/dtypes/units.py create mode 100644 pandas/tests/dtypes/test_units.py diff --git a/pandas/core/dtypes/units.py b/pandas/core/dtypes/units.py new file mode 100644 index 0000000000000..94ab1b91a2a70 --- /dev/null +++ b/pandas/core/dtypes/units.py @@ -0,0 +1,84 @@ + + +import pint + +import numpy as np + +from pandas import compat +from .dtypes import ExtensionDtype + +unit_registry = pint.UnitRegistry() + +class DimensionedFloatDtypeType(type): + """ + The type of UnitDtype. + """ + pass + +class DimensionedFloatDtype(ExtensionDtype): + + """ + A dtype for holding float64 nubers with units + + THIS IS NOT A REAL NUMPY DTYPE + """ + type = DimensionedFloatDtypeType + _metadata = ['unit'] + _cache = {} + kind = "f" + str="f8" + base = np.dtype('f8') + + def __new__(cls, unit=None): + """ Create a new unit if needed, otherwise return from the cache + + Parameters + ---------- + unit : string unit that this represents. + """ + if unit is None: + # we are called as an empty constructor + # generally for pickle compat + return object.__new__(cls) + + # Assume unit is a string. + unit_object = getattr(unit_registry, unit) #Raises TypeError if Unit is not a string. + + # set/retrieve from cache + try: + return cls._cache[unit] + except KeyError: + u = object.__new__(cls) + u.unit = unit_object + cls._cache[unit] = u + return u + def __hash__(self): + return hash(str(self)) + + def __unicode__(self): + return "dimensionedFloat[{unit}]".format(unit=str(self.unit)) + + def __setstate__(self, state): + # Use the same unit but from our registry, not the pickled unit + # Mixing units from different registries causes errors. + self.unit = getattr(unit_registry, str(state["unit"])) + + + def __eq__(self, other): + if isinstance(other, compat.string_types): + return other == str(self) + if not isinstance(other, DimensionedFloatDtype): + return NotImplemented + return self.unit==other.unit + + @classmethod + def construct_from_string(cls, string): + """ attempt to construct this type from a string, raise a TypeError if + it's not possible """ + try: + typename, unit = string.split("[") + if unit[-1]=="]" and typename == "dimensionedFloat": + return cls(unit[:-1]) + except: + pass + raise TypeError("cannot construct a DimensionedFloatDtype") diff --git a/pandas/tests/dtypes/test_units.py b/pandas/tests/dtypes/test_units.py new file mode 100644 index 0000000000000..27e8612d9fc08 --- /dev/null +++ b/pandas/tests/dtypes/test_units.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- + +import pytest + +import numpy as np + +import pint + +from pandas.util import testing as tm +import pandas as pd +from pandas.core.dtypes.units import DimensionedFloatDtype +from pandas.core.dtypes.common import is_dtype_equal + +from .test_dtypes import Base + +def test_construction_string(): + """ + Assert that ainstances of UnitDType have + """ + a = DimensionedFloatDtype("meter") + assert isinstance(a, DimensionedFloatDtype) + +def test_equality(): + assert DimensionedFloatDtype("meter")== DimensionedFloatDtype("meter") + a = DimensionedFloatDtype("meter") + assert a == a + + +def test_with_dataframe(): + a = pd.Series([1,2,3,4,5], dtype=DimensionedFloatDtype("meter")) + b = a+a + assert b[0]==6 + print b.dtype + assert b.dtype + #== DimensionedFloatDtype("meter*meter") + +class TestUnitDtype(Base): + def create(self): + return DimensionedFloatDtype("meter") + + def test_hash_vs_equality(self): + # make sure that we satisfy is semantics + dtype = self.dtype + dtype2 = DimensionedFloatDtype('meter') + assert dtype == dtype2 + assert dtype2 == dtype + assert dtype is dtype2 + assert dtype2 is dtype + assert hash(dtype) == hash(dtype2) + + def test_construction(self): + pytest.raises(Exception, #pint.UndefinedUnitError + lambda: DimensionedFloatDtype('thisIsNotAUnit')) + + + def test_construction_from_string(self): + result = DimensionedFloatDtype.construct_from_string( + 'dimensionedFloat[meter]') + assert is_dtype_equal(self.dtype, result) + pytest.raises(TypeError, + lambda: DimensionedFloatDtype.construct_from_string('foo')) + + def test_is_dtype(self): + assert not DimensionedFloatDtype.is_dtype(None) + assert DimensionedFloatDtype.is_dtype(self.dtype) + assert DimensionedFloatDtype.is_dtype('dimensionedFloat[meter]') + assert not DimensionedFloatDtype.is_dtype('foo') + assert DimensionedFloatDtype.is_dtype(DimensionedFloatDtype('hours')) + assert not DimensionedFloatDtype.is_dtype(np.float64) + + def test_equality(self): + assert is_dtype_equal(self.dtype, 'dimensionedFloat[meter]') + assert not is_dtype_equal(self.dtype, 'foo') From 59daf76052ea33980de81a6bf16c496270e4678a Mon Sep 17 00:00:00 2001 From: "Bernhard C. Thiel" Date: Wed, 2 Aug 2017 10:56:42 +0200 Subject: [PATCH 3/9] Basic concept seems to work --- pandas/core/dtypes/cast.py | 6 +++-- pandas/core/dtypes/common.py | 4 ++++ pandas/core/ops.py | 37 +++++++++++++++++++------------ pandas/core/series.py | 28 +++++++++++++++++++---- pandas/core/tools/datetimes.py | 2 ++ pandas/io/formats/format.py | 2 ++ pandas/tests/dtypes/test_units.py | 29 ++++++++++++++++++++---- 7 files changed, 84 insertions(+), 24 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 723e4f70da4e9..075b902cb3e15 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -910,9 +910,9 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): is_datetime64 = is_datetime64_dtype(dtype) is_datetime64tz = is_datetime64tz_dtype(dtype) is_timedelta64 = is_timedelta64_dtype(dtype) - + print("in maybe_cast_to_datetime.") if is_datetime64 or is_datetime64tz or is_timedelta64: - + print("in maybe_cast_to_datetime: is time") # force the dtype if needed msg = ("Passing in '{dtype}' dtype with no frequency is " "deprecated and will raise in a future version. " @@ -975,6 +975,8 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): # coerce datetimelike to object elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): + print("in maybe_cast_to_datetime: Is datetimevalue") + if is_object_dtype(dtype): if value.dtype != _NS_DTYPE: value = value.astype(_NS_DTYPE) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 0a11a813af115..6133b470fd03f 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -511,6 +511,8 @@ def is_categorical_dtype(arr_or_dtype): def is_dimensionedFloat_dtype(arr_or_dtype): if arr_or_dtype is None: return False + print("is dimensioned Float dtype: ", type(arr_or_dtype)) + print (DimensionedFloatDtype.is_dtype(arr_or_dtype)) return DimensionedFloatDtype.is_dtype(arr_or_dtype) def is_string_dtype(arr_or_dtype): @@ -1622,6 +1624,8 @@ def is_extension_type(arr): return True elif is_datetimetz(arr): return True + elif is_dimensionedFloat_dtype(arr): + return True return False diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 2ab0a091e73f3..65df109f97f50 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -24,7 +24,7 @@ from pandas.errors import PerformanceWarning from pandas.core.common import _values_from_object, _maybe_match_name from pandas.core.dtypes.missing import notna, isna -from pandas.core.dtypes.units import unit_registry +from pandas.core.dtypes.units import unit_registry, DimensionedFloatDtype from pandas.core.dtypes.common import ( needs_i8_conversion, is_datetimelike_v_numeric, @@ -346,31 +346,40 @@ def get_op(cls, left, right, name, na_op): class _DimFloatOp(_Op): def __init__(self, left, right, name, na_op): + print("DIMFLOATOP") super(_DimFloatOp, self).__init__(left, right, name, na_op) # Get the type of the calculation's result. - self.dtype = self._get_target_unit(left, right) + self.dtype = self._get_target_dtype(left, right, name) + print("target_dtype", self.dtype) # - self.lvalues = self._with_unit(data) - self.rvalues = self._with_unit(data) - - @staticmethod - def _get_target_dtype(left, right, name): + self.lvalues = self._with_unit(left.values) + self.rvalues = self._with_unit(right.values) + print ("lvals, rvals", type(self.lvalues), type(self.rvalues)) + @classmethod + def _get_target_dtype(cls, left, right, name): # Perform the operation on 1* the unit, to quickly get the resulting unit # Raises an Error, if the units are incompatible - left_unit = self._get_unit(left) - right_unit = self._get_unit(right) - calc_result = (getattr(1*left.unit, name)(1*right.unit)) + left_unit = cls._get_unit(left.values) + right_unit = cls._get_unit(right.values) + print("Units: l,r:", left_unit, right_unit) + calc_result = (getattr(1*left_unit, name)(1*right_unit)) if isinstance( calc_result, bool): return bool else: - return DimensionedFloatDtype(str(calc_result.unit)) + return DimensionedFloatDtype(str(calc_result.units)) print("Target dtype", target_unit) return target_unit @staticmethod def _with_unit(data): - if hasattr(data, "unit"): - return data.unit*data - return data + print("with_unit: ", type(data)) + if hasattr(data.dtype, "unit"): + return data.dtype.unit*data.values + return data.values + @staticmethod + def _get_unit(data): + if hasattr(data.dtype, "unit"): + return data.dtype.unit + return unit_registry.dimensionless diff --git a/pandas/core/series.py b/pandas/core/series.py index 60d268c89a9d7..0dab452714bce 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -23,6 +23,7 @@ is_datetimelike, is_datetime64tz_dtype, is_timedelta64_dtype, + is_dimensionedFloat_dtype, is_list_like, is_hashable, is_iterator, @@ -54,6 +55,7 @@ from pandas.core import generic, base from pandas.core.internals import SingleBlockManager from pandas.core.categorical import Categorical, CategoricalAccessor +from pandas.core.dimensioned import Dimensional import pandas.core.strings as strings from pandas.core.indexes.accessors import ( maybe_to_datetimelike, CombinedDatetimelikeProperties) @@ -2990,7 +2992,7 @@ def _sanitize_array(data, index, dtype=None, copy=False, if dtype is not None: dtype = pandas_dtype(dtype) - + print("In sanitize array: dtype done") if isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) if mask.any(): @@ -3005,22 +3007,34 @@ def _try_cast(arr, take_fast_path): if take_fast_path: if maybe_castable(arr) and not copy and dtype is None: return arr + print("In _try_cast") try: subarr = maybe_cast_to_datetime(arr, dtype) - if not is_extension_type(subarr): + print("In _try_cast: testing extention type") + if not is_extension_type(subarr) and not is_extension_type(dtype): + print("In _try_cast: not extention type") subarr = np.array(subarr, dtype=dtype, copy=copy) - except (ValueError, TypeError): + else: + if is_dimensionedFloat_dtype(dtype): + subarr = Dimensional(np.asarray(subarr, dtype = "float64"), dtype = dtype) + print("In _try_cast: is extension type") + except (ValueError, TypeError) as e: + print("In _try_cast: exception", e) + if is_categorical_dtype(dtype): subarr = Categorical(arr) elif dtype is not None and raise_cast_failure: + print("In _try_cast: raise") raise else: subarr = np.array(arr, dtype=object, copy=copy) + print("try_cast returning") return subarr # GH #846 if isinstance(data, (np.ndarray, Index, Series)): + print("In sanitize array: is array-like") if dtype is not None: subarr = np.array(data, copy=False) @@ -3053,9 +3067,13 @@ def _try_cast(arr, take_fast_path): return subarr elif isinstance(data, (list, tuple)) and len(data) > 0: + print("In sanitize array: is list-like") if dtype is not None: try: + print("In sanitize array: before _try_cast") subarr = _try_cast(data, False) + print("In sanitize array: after _try_cast") + except Exception: if raise_cast_failure: # pragma: no cover raise @@ -3084,7 +3102,7 @@ def create_from_value(value, index, dtype): subarr.fill(value) return subarr - + print("subarr has type ", type(subarr)) # scalar like, GH if getattr(subarr, 'ndim', 0) == 0: if isinstance(data, list): # pragma: no cover @@ -3118,12 +3136,14 @@ def create_from_value(value, index, dtype): raise Exception('Data must be 1-dimensional') else: subarr = _asarray_tuplesafe(data, dtype=dtype) + print("subarr has type ", type(subarr), subarr.dtype) # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. if issubclass(subarr.dtype.type, compat.string_types): subarr = np.array(data, dtype=object, copy=copy) + print("returning subarr of dtype ", subarr.dtype) return subarr diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index a1f323aff7c1a..c294bd09af634 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -336,6 +336,8 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, 2 1960-01-04 """ + + print("to datetime. Box is {}".format(box)) from pandas.core.indexes.datetimes import DatetimeIndex tz = 'utc' if utc else None diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2b322431bd301..1b51699a68fa4 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1747,6 +1747,8 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', fmt_klass = Timedelta64Formatter else: fmt_klass = GenericArrayFormatter + if hasattr(values, "values"): + values = values.values if space is None: space = get_option("display.column_space") diff --git a/pandas/tests/dtypes/test_units.py b/pandas/tests/dtypes/test_units.py index 27e8612d9fc08..da89fd959ab41 100644 --- a/pandas/tests/dtypes/test_units.py +++ b/pandas/tests/dtypes/test_units.py @@ -3,9 +3,10 @@ import pytest import numpy as np +import numpy.testing import pint - +import numpy as np from pandas.util import testing as tm import pandas as pd from pandas.core.dtypes.units import DimensionedFloatDtype @@ -26,12 +27,32 @@ def test_equality(): assert a == a -def test_with_dataframe(): +def test_with_series(): + print("=== constructing timezone") + a = pd.Series([1,2,3,4,5], dtype="datetime64[ns]") + + print("=== constructing series") a = pd.Series([1,2,3,4,5], dtype=DimensionedFloatDtype("meter")) + print("=== sum") b = a+a - assert b[0]==6 + print("=== print b") + print(b) + print("=== assert b[0]==2") + assert b[0]==2 + print("=== b.dtype") print b.dtype - assert b.dtype + print("=== assert dtype==....") + assert b.dtype == DimensionedFloatDtype("meter") + c = pd.Series([5, 10, 50, 100, 500], dtype=DimensionedFloatDtype("centimeter")) + print("=== equality ") + print(a==a, type(a==a)) + print("== print") + print(a+c == pd.Series([1.05, 2.1, 3.5, 5, 10.], dtype=DimensionedFloatDtype("meter"))) + print("a+c", a+c, (a+c).dtype) + print("=== final asserts") + numpy.testing.assert_allclose(a+c, pd.Series([1.05, 2.1, 3.5, 5, 10.], dtype=DimensionedFloatDtype("meter"))) + numpy.testing.assert_allclose(a*c, pd.Series([5, 20, 150, 400, 2500.], dtype=DimensionedFloatDtype("meter*centimeter"))) + #== DimensionedFloatDtype("meter*meter") class TestUnitDtype(Base): From cfa7d4338c5bda81b8f5f410f94ba9e53ce72c50 Mon Sep 17 00:00:00 2001 From: "Bernhard C. Thiel" Date: Wed, 2 Aug 2017 11:53:47 +0200 Subject: [PATCH 4/9] Removed debug-print statements --- pandas/core/dtypes/cast.py | 6 +++--- pandas/core/dtypes/common.py | 7 +------ pandas/core/ops.py | 11 +++-------- pandas/core/series.py | 17 ----------------- pandas/tests/dtypes/test_units.py | 19 +------------------ 5 files changed, 8 insertions(+), 52 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 075b902cb3e15..396cb4bd373a4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -910,9 +910,9 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): is_datetime64 = is_datetime64_dtype(dtype) is_datetime64tz = is_datetime64tz_dtype(dtype) is_timedelta64 = is_timedelta64_dtype(dtype) - print("in maybe_cast_to_datetime.") + if is_datetime64 or is_datetime64tz or is_timedelta64: - print("in maybe_cast_to_datetime: is time") + # force the dtype if needed msg = ("Passing in '{dtype}' dtype with no frequency is " "deprecated and will raise in a future version. " @@ -975,7 +975,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): # coerce datetimelike to object elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): - print("in maybe_cast_to_datetime: Is datetimevalue") + if is_object_dtype(dtype): if value.dtype != _NS_DTYPE: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 6133b470fd03f..c49ada80ceb8d 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -511,8 +511,6 @@ def is_categorical_dtype(arr_or_dtype): def is_dimensionedFloat_dtype(arr_or_dtype): if arr_or_dtype is None: return False - print("is dimensioned Float dtype: ", type(arr_or_dtype)) - print (DimensionedFloatDtype.is_dtype(arr_or_dtype)) return DimensionedFloatDtype.is_dtype(arr_or_dtype) def is_string_dtype(arr_or_dtype): @@ -689,13 +687,10 @@ def is_dtype_equal(source, target): """ try: - print("Comparing {} and {}".format(source, target)) source = _get_dtype(source) target = _get_dtype(target) - print("Comparing {} and {}".format(source, target)) return source == target - except (TypeError, AttributeError) as e: - print(e) + except (TypeError, AttributeError): # invalid comparison # object == category will hit this return False diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 65df109f97f50..312928bc2993a 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -332,25 +332,21 @@ def get_op(cls, left, right, name, na_op): is_datetime_lhs = (is_datetime64_dtype(left) or is_datetime64tz_dtype(left)) - print("dtypes l,r:", left.dtype, right.dtype) if is_dimensionedFloat_dtype(left.dtype) or is_dimensionedFloat_dtype(right.dtype): if (is_datetime_lhs or is_timedelta_lhs): raise TypeError("Cannot mix DimensionedFloat and Time for operations") - print("getDimFloatOP") return _DimFloatOp(left, right, name, na_op) if not (is_datetime_lhs or is_timedelta_lhs): - print("Get normal OP") return _Op(left, right, name, na_op) else: return _TimeOp(left, right, name, na_op) class _DimFloatOp(_Op): def __init__(self, left, right, name, na_op): - print("DIMFLOATOP") + super(_DimFloatOp, self).__init__(left, right, name, na_op) # Get the type of the calculation's result. self.dtype = self._get_target_dtype(left, right, name) - print("target_dtype", self.dtype) # self.lvalues = self._with_unit(left.values) self.rvalues = self._with_unit(right.values) @@ -361,17 +357,16 @@ def _get_target_dtype(cls, left, right, name): # Raises an Error, if the units are incompatible left_unit = cls._get_unit(left.values) right_unit = cls._get_unit(right.values) - print("Units: l,r:", left_unit, right_unit) calc_result = (getattr(1*left_unit, name)(1*right_unit)) if isinstance( calc_result, bool): return bool else: return DimensionedFloatDtype(str(calc_result.units)) - print("Target dtype", target_unit) + return target_unit @staticmethod def _with_unit(data): - print("with_unit: ", type(data)) + if hasattr(data.dtype, "unit"): return data.dtype.unit*data.values return data.values diff --git a/pandas/core/series.py b/pandas/core/series.py index 0dab452714bce..ea274dfe97d77 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2992,7 +2992,6 @@ def _sanitize_array(data, index, dtype=None, copy=False, if dtype is not None: dtype = pandas_dtype(dtype) - print("In sanitize array: dtype done") if isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) if mask.any(): @@ -3007,34 +3006,24 @@ def _try_cast(arr, take_fast_path): if take_fast_path: if maybe_castable(arr) and not copy and dtype is None: return arr - print("In _try_cast") - try: subarr = maybe_cast_to_datetime(arr, dtype) - print("In _try_cast: testing extention type") if not is_extension_type(subarr) and not is_extension_type(dtype): - print("In _try_cast: not extention type") subarr = np.array(subarr, dtype=dtype, copy=copy) else: if is_dimensionedFloat_dtype(dtype): subarr = Dimensional(np.asarray(subarr, dtype = "float64"), dtype = dtype) - print("In _try_cast: is extension type") except (ValueError, TypeError) as e: - print("In _try_cast: exception", e) - if is_categorical_dtype(dtype): subarr = Categorical(arr) elif dtype is not None and raise_cast_failure: - print("In _try_cast: raise") raise else: subarr = np.array(arr, dtype=object, copy=copy) - print("try_cast returning") return subarr # GH #846 if isinstance(data, (np.ndarray, Index, Series)): - print("In sanitize array: is array-like") if dtype is not None: subarr = np.array(data, copy=False) @@ -3067,12 +3056,9 @@ def _try_cast(arr, take_fast_path): return subarr elif isinstance(data, (list, tuple)) and len(data) > 0: - print("In sanitize array: is list-like") if dtype is not None: try: - print("In sanitize array: before _try_cast") subarr = _try_cast(data, False) - print("In sanitize array: after _try_cast") except Exception: if raise_cast_failure: # pragma: no cover @@ -3102,7 +3088,6 @@ def create_from_value(value, index, dtype): subarr.fill(value) return subarr - print("subarr has type ", type(subarr)) # scalar like, GH if getattr(subarr, 'ndim', 0) == 0: if isinstance(data, list): # pragma: no cover @@ -3136,14 +3121,12 @@ def create_from_value(value, index, dtype): raise Exception('Data must be 1-dimensional') else: subarr = _asarray_tuplesafe(data, dtype=dtype) - print("subarr has type ", type(subarr), subarr.dtype) # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. if issubclass(subarr.dtype.type, compat.string_types): subarr = np.array(data, dtype=object, copy=copy) - print("returning subarr of dtype ", subarr.dtype) return subarr diff --git a/pandas/tests/dtypes/test_units.py b/pandas/tests/dtypes/test_units.py index da89fd959ab41..d720f54105a3f 100644 --- a/pandas/tests/dtypes/test_units.py +++ b/pandas/tests/dtypes/test_units.py @@ -28,33 +28,16 @@ def test_equality(): def test_with_series(): - print("=== constructing timezone") - a = pd.Series([1,2,3,4,5], dtype="datetime64[ns]") - print("=== constructing series") + a = pd.Series([1,2,3,4,5], dtype="datetime64[ns]") a = pd.Series([1,2,3,4,5], dtype=DimensionedFloatDtype("meter")) - print("=== sum") b = a+a - print("=== print b") - print(b) - print("=== assert b[0]==2") assert b[0]==2 - print("=== b.dtype") - print b.dtype - print("=== assert dtype==....") assert b.dtype == DimensionedFloatDtype("meter") c = pd.Series([5, 10, 50, 100, 500], dtype=DimensionedFloatDtype("centimeter")) - print("=== equality ") - print(a==a, type(a==a)) - print("== print") - print(a+c == pd.Series([1.05, 2.1, 3.5, 5, 10.], dtype=DimensionedFloatDtype("meter"))) - print("a+c", a+c, (a+c).dtype) - print("=== final asserts") numpy.testing.assert_allclose(a+c, pd.Series([1.05, 2.1, 3.5, 5, 10.], dtype=DimensionedFloatDtype("meter"))) numpy.testing.assert_allclose(a*c, pd.Series([5, 20, 150, 400, 2500.], dtype=DimensionedFloatDtype("meter*centimeter"))) - #== DimensionedFloatDtype("meter*meter") - class TestUnitDtype(Base): def create(self): return DimensionedFloatDtype("meter") From 586fd2d1c97fd7e06e4a5577d34a087143d9c1b0 Mon Sep 17 00:00:00 2001 From: "Bernhard C. Thiel" Date: Wed, 2 Aug 2017 12:01:17 +0200 Subject: [PATCH 5/9] Added missing file --- pandas/core/dimensioned.py | 263 +++++++++++++++++++++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 pandas/core/dimensioned.py diff --git a/pandas/core/dimensioned.py b/pandas/core/dimensioned.py new file mode 100644 index 0000000000000..382128efd046f --- /dev/null +++ b/pandas/core/dimensioned.py @@ -0,0 +1,263 @@ +from pandas.core.base import (PandasObject) +from pandas.util._decorators import cache_readonly +from pandas import compat +import numpy as np +class Dimensional(PandasObject): + """ + """ + + __array_priority__ = 10 + _typ = 'dimensional' + + #def __add__(self, other): + # return self.values+other.values + def __init__(self, values, dtype): + # TODO: Sanitize + self.values = values + self.dtype = dtype + @property + def _constructor(self): + return Dimensional + + def copy(self): + """ Copy constructor. """ + return self._constructor(self.values.copy(), self.dtype) + + def astype(self, dtype, copy=True): + """ + Coerce this type to another dtype + + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and dtype is categorical, the original + object is returned. + + .. versionadded:: 0.19.0 + + """ + return np.array(self, dtype=dtype, copy=copy) + + @cache_readonly + def ndim(self): + """Number of dimensions """ + return self.values.ndim + + @cache_readonly + def size(self): + """ return the len of myself """ + return len(self) + + @property + def base(self): + """ compat, we are always our own object """ + return None + + # for Series/ndarray like compat + @property + def shape(self): + """ Shape of the Categorical. + + For internal compatibility with numpy arrays. + + Returns + ------- + shape : tuple + """ + + return tuple([len(self.values)]) + + + + def __array__(self, dtype=None): + """ + The numpy array interface. + + Returns + ------- + values : numpy array + A numpy array of either the specified dtype or, + if dtype==None (default), the same dtype as + categorical.categories.dtype + """ + if dtype: + return np.asarray(self.values, dtype) + return self.values + + @property + def T(self): + return self + + + def isna(self): + raise NotImplementedError + isnull = isna + + def notna(self): + """ + Inverse of isna + + Both missing values (-1 in .codes) and NA as a category are detected as + null. + + Returns + ------- + a boolean array of whether my values are not null + + See also + -------- + notna : top-level notna + notnull : alias of notna + Categorical.isna : boolean inverse of Categorical.notna + + """ + return ~self.isna() + notnull = notna + + def put(self, *args, **kwargs): + """ + Replace specific elements in the Categorical with given values. + """ + raise NotImplementedError(("'put' is not yet implemented " + "for Categorical")) + + def dropna(self): + raise NotImplementedError + + + + + def get_values(self): + """ Return the values. + + For internal compatibility with pandas formatting. + + Returns + ------- + values : numpy array + A numpy array of the same dtype as categorical.categories.dtype or + Index if datetime / periods + """ + return np.array(self) + + def ravel(self, order='C'): + """ Return a flattened (numpy) array. + + For internal compatibility with numpy arrays. + + Returns + ------- + raveled : numpy array + """ + return np.array(self) + + def view(self): + """Return a view of myself. + + For internal compatibility with numpy arrays. + + Returns + ------- + view : Categorical + Returns `self`! + """ + return self + + def to_dense(self): + """Return my 'dense' representation + + For internal compatibility with numpy arrays. + + Returns + ------- + dense : array + """ + return np.asarray(self) + + def fillna(self, value=None, method=None, limit=None): + """ Fill NA/NaN values using the specified method. + + Parameters + ---------- + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + value : scalar + Value to use to fill holes (e.g. 0) + limit : int, default None + (Not implemented yet for Categorical!) + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. + + Returns + ------- + filled : Categorical with NA/NaN filled + """ + raise NotImplementedError + + + def _slice(self, slicer): + """ Return a slice of myself. + + For internal compatibility with numpy arrays. + """ + + # only allow 1 dimensional slicing, but can + # in a 2-d case be passd (slice(None),....) + if isinstance(slicer, tuple) and len(slicer) == 2: + if not is_null_slice(slicer[0]): + raise AssertionError("invalid slicing for a 1-ndim " + "categorical") + slicer = slicer[1] + + return self._constructor(self.values[slicer], self.dtype) + + def __len__(self): + """The length of this Categorical.""" + return len(self.values) + + def __iter__(self): + """Returns an Iterator over the values of this Categorical.""" + return iter(self.get_values()) + + def _tidy_repr(self, max_vals=10, footer=True): + """ a short repr displaying only max_vals and an optional (but default + footer) + """ + num = max_vals // 2 + head = self[:num]._get_repr(length=False, footer=False) + tail = self[-(max_vals - num):]._get_repr(length=False, footer=False) + + result = '%s, ..., %s' % (head[:-1], tail[1:]) + if footer: + result = '%s\n%s' % (result, self._repr_footer()) + + return compat.text_type(result) + + def _repr_footer(self): + + return 'Length: %d' % (len(self)) + + def _get_repr(self, length=True, na_rep='NaN', footer=True): + return "Dimensional {}".format(self.__array__()) + # TODO: Implement properly + + def __unicode__(self): + """ Unicode representation. """ + # TODO: implement + return self._tidy_repr() + + + def __getitem__(self, key): + """ Return an item. """ + return Dimensional(values=self.values[key], dtype = self.dtype) + + def __setitem__(self, key, value): + raise NotImplementedError From 541cc8dd67dce718ff6fa7ddb511c25a614e07c8 Mon Sep 17 00:00:00 2001 From: "Bernhard C. Thiel" Date: Wed, 2 Aug 2017 12:36:10 +0200 Subject: [PATCH 6/9] pep8 --- pandas/core/dimensioned.py | 32 ++++++-------------------- pandas/core/dtypes/cast.py | 2 -- pandas/core/dtypes/common.py | 2 ++ pandas/core/dtypes/units.py | 12 +++++----- pandas/core/ops.py | 21 +++++++++-------- pandas/core/series.py | 5 +++-- pandas/tests/dtypes/test_common.py | 4 +++- pandas/tests/dtypes/test_units.py | 36 +++++++++++++++++------------- 8 files changed, 54 insertions(+), 60 deletions(-) diff --git a/pandas/core/dimensioned.py b/pandas/core/dimensioned.py index 382128efd046f..b6a83d278fa78 100644 --- a/pandas/core/dimensioned.py +++ b/pandas/core/dimensioned.py @@ -1,7 +1,11 @@ +import numpy as np + from pandas.core.base import (PandasObject) from pandas.util._decorators import cache_readonly from pandas import compat -import numpy as np +from pandas.core.common import is_null_slice + + class Dimensional(PandasObject): """ """ @@ -9,12 +13,11 @@ class Dimensional(PandasObject): __array_priority__ = 10 _typ = 'dimensional' - #def __add__(self, other): - # return self.values+other.values def __init__(self, values, dtype): # TODO: Sanitize self.values = values self.dtype = dtype + @property def _constructor(self): return Dimensional @@ -26,17 +29,6 @@ def copy(self): def astype(self, dtype, copy=True): """ Coerce this type to another dtype - - Parameters - ---------- - dtype : numpy dtype or pandas type - copy : bool, default True - By default, astype always returns a newly allocated object. - If copy is set to False and dtype is categorical, the original - object is returned. - - .. versionadded:: 0.19.0 - """ return np.array(self, dtype=dtype, copy=copy) @@ -66,11 +58,8 @@ def shape(self): ------- shape : tuple """ - return tuple([len(self.values)]) - - def __array__(self, dtype=None): """ The numpy array interface. @@ -90,7 +79,6 @@ def __array__(self, dtype=None): def T(self): return self - def isna(self): raise NotImplementedError isnull = isna @@ -126,9 +114,6 @@ def put(self, *args, **kwargs): def dropna(self): raise NotImplementedError - - - def get_values(self): """ Return the values. @@ -202,7 +187,6 @@ def fillna(self, value=None, method=None, limit=None): """ raise NotImplementedError - def _slice(self, slicer): """ Return a slice of myself. @@ -242,7 +226,6 @@ def _tidy_repr(self, max_vals=10, footer=True): return compat.text_type(result) def _repr_footer(self): - return 'Length: %d' % (len(self)) def _get_repr(self, length=True, na_rep='NaN', footer=True): @@ -254,10 +237,9 @@ def __unicode__(self): # TODO: implement return self._tidy_repr() - def __getitem__(self, key): """ Return an item. """ - return Dimensional(values=self.values[key], dtype = self.dtype) + return Dimensional(values=self.values[key], dtype=self.dtype) def __setitem__(self, key, value): raise NotImplementedError diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 396cb4bd373a4..723e4f70da4e9 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -975,8 +975,6 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): # coerce datetimelike to object elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): - - if is_object_dtype(dtype): if value.dtype != _NS_DTYPE: value = value.astype(_NS_DTYPE) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c49ada80ceb8d..001bed0ae5f16 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -508,11 +508,13 @@ def is_categorical_dtype(arr_or_dtype): return False return CategoricalDtype.is_dtype(arr_or_dtype) + def is_dimensionedFloat_dtype(arr_or_dtype): if arr_or_dtype is None: return False return DimensionedFloatDtype.is_dtype(arr_or_dtype) + def is_string_dtype(arr_or_dtype): """ Check whether the provided array or dtype is of the string dtype. diff --git a/pandas/core/dtypes/units.py b/pandas/core/dtypes/units.py index 94ab1b91a2a70..f5de32b507b23 100644 --- a/pandas/core/dtypes/units.py +++ b/pandas/core/dtypes/units.py @@ -9,12 +9,14 @@ unit_registry = pint.UnitRegistry() + class DimensionedFloatDtypeType(type): """ The type of UnitDtype. """ pass + class DimensionedFloatDtype(ExtensionDtype): """ @@ -26,7 +28,7 @@ class DimensionedFloatDtype(ExtensionDtype): _metadata = ['unit'] _cache = {} kind = "f" - str="f8" + str = "f8" base = np.dtype('f8') def __new__(cls, unit=None): @@ -42,7 +44,7 @@ def __new__(cls, unit=None): return object.__new__(cls) # Assume unit is a string. - unit_object = getattr(unit_registry, unit) #Raises TypeError if Unit is not a string. + unit_object = getattr(unit_registry, unit) # TypeError if unit!=string # set/retrieve from cache try: @@ -52,6 +54,7 @@ def __new__(cls, unit=None): u.unit = unit_object cls._cache[unit] = u return u + def __hash__(self): return hash(str(self)) @@ -63,13 +66,12 @@ def __setstate__(self, state): # Mixing units from different registries causes errors. self.unit = getattr(unit_registry, str(state["unit"])) - def __eq__(self, other): if isinstance(other, compat.string_types): return other == str(self) if not isinstance(other, DimensionedFloatDtype): return NotImplemented - return self.unit==other.unit + return self.unit == other.unit @classmethod def construct_from_string(cls, string): @@ -77,7 +79,7 @@ def construct_from_string(cls, string): it's not possible """ try: typename, unit = string.split("[") - if unit[-1]=="]" and typename == "dimensionedFloat": + if unit[-1] == "]" and typename == "dimensionedFloat": return cls(unit[:-1]) except: pass diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 312928bc2993a..ddf4e5b8889b3 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -332,15 +332,18 @@ def get_op(cls, left, right, name, na_op): is_datetime_lhs = (is_datetime64_dtype(left) or is_datetime64tz_dtype(left)) - if is_dimensionedFloat_dtype(left.dtype) or is_dimensionedFloat_dtype(right.dtype): + if (is_dimensionedFloat_dtype(left.dtype) or + is_dimensionedFloat_dtype(right.dtype)): if (is_datetime_lhs or is_timedelta_lhs): - raise TypeError("Cannot mix DimensionedFloat and Time for operations") + raise TypeError("Cannot mix DimensionedFloat and " + "Time for operations") return _DimFloatOp(left, right, name, na_op) if not (is_datetime_lhs or is_timedelta_lhs): return _Op(left, right, name, na_op) else: return _TimeOp(left, right, name, na_op) + class _DimFloatOp(_Op): def __init__(self, left, right, name, na_op): @@ -351,25 +354,26 @@ def __init__(self, left, right, name, na_op): self.lvalues = self._with_unit(left.values) self.rvalues = self._with_unit(right.values) print ("lvals, rvals", type(self.lvalues), type(self.rvalues)) + @classmethod def _get_target_dtype(cls, left, right, name): - # Perform the operation on 1* the unit, to quickly get the resulting unit + # Perform the operation on 1* the unit, + # to quickly get the resulting unit # Raises an Error, if the units are incompatible left_unit = cls._get_unit(left.values) right_unit = cls._get_unit(right.values) - calc_result = (getattr(1*left_unit, name)(1*right_unit)) - if isinstance( calc_result, bool): + calc_result = (getattr(1 * left_unit, name)(1 * right_unit)) + if isinstance(calc_result, bool): return bool else: return DimensionedFloatDtype(str(calc_result.units)) - return target_unit @staticmethod def _with_unit(data): - if hasattr(data.dtype, "unit"): - return data.dtype.unit*data.values + return data.dtype.unit * data.values return data.values + @staticmethod def _get_unit(data): if hasattr(data.dtype, "unit"): @@ -377,7 +381,6 @@ def _get_unit(data): return unit_registry.dimensionless - class _TimeOp(_Op): """ Wrapper around Series datetime/time/timedelta arithmetic operations. diff --git a/pandas/core/series.py b/pandas/core/series.py index ea274dfe97d77..0ad726ead1cfd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3012,8 +3012,9 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) else: if is_dimensionedFloat_dtype(dtype): - subarr = Dimensional(np.asarray(subarr, dtype = "float64"), dtype = dtype) - except (ValueError, TypeError) as e: + subarr = Dimensional(np.asarray(subarr, dtype="float64"), + dtype=dtype) + except (ValueError, TypeError): if is_categorical_dtype(dtype): subarr = Categorical(arr) elif dtype is not None and raise_cast_failure: diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index a2ef5243053da..74e9ca5b84ebb 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -9,7 +9,9 @@ import pandas.core.dtypes.common as com import pandas.util.testing as tm -from pandas.core.dtypes.units import DimensionedFloatDtype, DimensionedFloatDtypeType +from pandas.core.dtypes.units import (DimensionedFloatDtype, + DimensionedFloatDtypeType) + class TestPandasDtype(object): diff --git a/pandas/tests/dtypes/test_units.py b/pandas/tests/dtypes/test_units.py index d720f54105a3f..60dd5b695864f 100644 --- a/pandas/tests/dtypes/test_units.py +++ b/pandas/tests/dtypes/test_units.py @@ -3,17 +3,15 @@ import pytest import numpy as np -import numpy.testing +from numpy.testing import assert_allclose -import pint -import numpy as np -from pandas.util import testing as tm import pandas as pd from pandas.core.dtypes.units import DimensionedFloatDtype from pandas.core.dtypes.common import is_dtype_equal from .test_dtypes import Base + def test_construction_string(): """ Assert that ainstances of UnitDType have @@ -21,22 +19,29 @@ def test_construction_string(): a = DimensionedFloatDtype("meter") assert isinstance(a, DimensionedFloatDtype) + def test_equality(): - assert DimensionedFloatDtype("meter")== DimensionedFloatDtype("meter") + assert DimensionedFloatDtype("meter") == DimensionedFloatDtype("meter") a = DimensionedFloatDtype("meter") assert a == a def test_with_series(): - a = pd.Series([1,2,3,4,5], dtype="datetime64[ns]") - a = pd.Series([1,2,3,4,5], dtype=DimensionedFloatDtype("meter")) - b = a+a - assert b[0]==2 - assert b.dtype == DimensionedFloatDtype("meter") - c = pd.Series([5, 10, 50, 100, 500], dtype=DimensionedFloatDtype("centimeter")) - numpy.testing.assert_allclose(a+c, pd.Series([1.05, 2.1, 3.5, 5, 10.], dtype=DimensionedFloatDtype("meter"))) - numpy.testing.assert_allclose(a*c, pd.Series([5, 20, 150, 400, 2500.], dtype=DimensionedFloatDtype("meter*centimeter"))) + a = pd.Series([1, 2, 3, 4, 5], dtype="datetime64[ns]") + a = pd.Series([1, 2, 3, 4, 5], dtype=DimensionedFloatDtype("meter")) + b = a + a + assert b[0] == 2 + assert b.dtype == DimensionedFloatDtype("meter") + c = pd.Series([5, 10, 50, 100, 500], + dtype=DimensionedFloatDtype("centimeter")) + assert_allclose(a + c, + pd.Series([1.05, 2.1, 3.5, 5, 10.], + dtype=DimensionedFloatDtype("meter"))) + assert_allclose(a * c, + pd.Series([5, 20, 150, 400, 2500.], + dtype=DimensionedFloatDtype("meter*centimeter"))) + class TestUnitDtype(Base): def create(self): @@ -53,16 +58,15 @@ def test_hash_vs_equality(self): assert hash(dtype) == hash(dtype2) def test_construction(self): - pytest.raises(Exception, #pint.UndefinedUnitError + pytest.raises(Exception, # pint.UndefinedUnitError lambda: DimensionedFloatDtype('thisIsNotAUnit')) - def test_construction_from_string(self): result = DimensionedFloatDtype.construct_from_string( 'dimensionedFloat[meter]') assert is_dtype_equal(self.dtype, result) pytest.raises(TypeError, - lambda: DimensionedFloatDtype.construct_from_string('foo')) + lambda: DimensionedFloatDtype.construct_from_string('foo')) def test_is_dtype(self): assert not DimensionedFloatDtype.is_dtype(None) From 7bfbee2b72a90bbd9e97fcc4454f89f008f76359 Mon Sep 17 00:00:00 2001 From: "Bernhard C. Thiel" Date: Wed, 2 Aug 2017 15:05:03 +0200 Subject: [PATCH 7/9] Added pint to travis requirenments. (In the future, we need to remove the dependency on pint) --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 897d31cf23a3b..9aed640406733 100644 --- a/.travis.yml +++ b/.travis.yml @@ -104,7 +104,7 @@ before_install: - git tag - ci/before_install_travis.sh - export DISPLAY=":99.0" - + - pip install pint install: - echo "install start" - ci/prep_cython_cache.sh From b71b77be811cbc1b609c3f466de5d9fb45dcbe71 Mon Sep 17 00:00:00 2001 From: "Bernhard C. Thiel" Date: Wed, 2 Aug 2017 23:25:19 +0200 Subject: [PATCH 8/9] Let Series store the dtype and get rig of Dimensional class --- pandas/core/categorical.py | 1 - pandas/core/dimensioned.py | 245 ------------------------------ pandas/core/ops.py | 22 ++- pandas/core/series.py | 20 ++- pandas/tests/dtypes/test_units.py | 21 ++- 5 files changed, 46 insertions(+), 263 deletions(-) delete mode 100644 pandas/core/dimensioned.py diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 1392ad2f011db..2530b3e538aa1 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -249,7 +249,6 @@ class Categorical(PandasObject): _typ = 'categorical' def __init__(self, values, categories=None, ordered=False, fastpath=False): - self._validate_ordered(ordered) if fastpath: diff --git a/pandas/core/dimensioned.py b/pandas/core/dimensioned.py deleted file mode 100644 index b6a83d278fa78..0000000000000 --- a/pandas/core/dimensioned.py +++ /dev/null @@ -1,245 +0,0 @@ -import numpy as np - -from pandas.core.base import (PandasObject) -from pandas.util._decorators import cache_readonly -from pandas import compat -from pandas.core.common import is_null_slice - - -class Dimensional(PandasObject): - """ - """ - - __array_priority__ = 10 - _typ = 'dimensional' - - def __init__(self, values, dtype): - # TODO: Sanitize - self.values = values - self.dtype = dtype - - @property - def _constructor(self): - return Dimensional - - def copy(self): - """ Copy constructor. """ - return self._constructor(self.values.copy(), self.dtype) - - def astype(self, dtype, copy=True): - """ - Coerce this type to another dtype - """ - return np.array(self, dtype=dtype, copy=copy) - - @cache_readonly - def ndim(self): - """Number of dimensions """ - return self.values.ndim - - @cache_readonly - def size(self): - """ return the len of myself """ - return len(self) - - @property - def base(self): - """ compat, we are always our own object """ - return None - - # for Series/ndarray like compat - @property - def shape(self): - """ Shape of the Categorical. - - For internal compatibility with numpy arrays. - - Returns - ------- - shape : tuple - """ - return tuple([len(self.values)]) - - def __array__(self, dtype=None): - """ - The numpy array interface. - - Returns - ------- - values : numpy array - A numpy array of either the specified dtype or, - if dtype==None (default), the same dtype as - categorical.categories.dtype - """ - if dtype: - return np.asarray(self.values, dtype) - return self.values - - @property - def T(self): - return self - - def isna(self): - raise NotImplementedError - isnull = isna - - def notna(self): - """ - Inverse of isna - - Both missing values (-1 in .codes) and NA as a category are detected as - null. - - Returns - ------- - a boolean array of whether my values are not null - - See also - -------- - notna : top-level notna - notnull : alias of notna - Categorical.isna : boolean inverse of Categorical.notna - - """ - return ~self.isna() - notnull = notna - - def put(self, *args, **kwargs): - """ - Replace specific elements in the Categorical with given values. - """ - raise NotImplementedError(("'put' is not yet implemented " - "for Categorical")) - - def dropna(self): - raise NotImplementedError - - def get_values(self): - """ Return the values. - - For internal compatibility with pandas formatting. - - Returns - ------- - values : numpy array - A numpy array of the same dtype as categorical.categories.dtype or - Index if datetime / periods - """ - return np.array(self) - - def ravel(self, order='C'): - """ Return a flattened (numpy) array. - - For internal compatibility with numpy arrays. - - Returns - ------- - raveled : numpy array - """ - return np.array(self) - - def view(self): - """Return a view of myself. - - For internal compatibility with numpy arrays. - - Returns - ------- - view : Categorical - Returns `self`! - """ - return self - - def to_dense(self): - """Return my 'dense' representation - - For internal compatibility with numpy arrays. - - Returns - ------- - dense : array - """ - return np.asarray(self) - - def fillna(self, value=None, method=None, limit=None): - """ Fill NA/NaN values using the specified method. - - Parameters - ---------- - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - Method to use for filling holes in reindexed Series - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap - value : scalar - Value to use to fill holes (e.g. 0) - limit : int, default None - (Not implemented yet for Categorical!) - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. - - Returns - ------- - filled : Categorical with NA/NaN filled - """ - raise NotImplementedError - - def _slice(self, slicer): - """ Return a slice of myself. - - For internal compatibility with numpy arrays. - """ - - # only allow 1 dimensional slicing, but can - # in a 2-d case be passd (slice(None),....) - if isinstance(slicer, tuple) and len(slicer) == 2: - if not is_null_slice(slicer[0]): - raise AssertionError("invalid slicing for a 1-ndim " - "categorical") - slicer = slicer[1] - - return self._constructor(self.values[slicer], self.dtype) - - def __len__(self): - """The length of this Categorical.""" - return len(self.values) - - def __iter__(self): - """Returns an Iterator over the values of this Categorical.""" - return iter(self.get_values()) - - def _tidy_repr(self, max_vals=10, footer=True): - """ a short repr displaying only max_vals and an optional (but default - footer) - """ - num = max_vals // 2 - head = self[:num]._get_repr(length=False, footer=False) - tail = self[-(max_vals - num):]._get_repr(length=False, footer=False) - - result = '%s, ..., %s' % (head[:-1], tail[1:]) - if footer: - result = '%s\n%s' % (result, self._repr_footer()) - - return compat.text_type(result) - - def _repr_footer(self): - return 'Length: %d' % (len(self)) - - def _get_repr(self, length=True, na_rep='NaN', footer=True): - return "Dimensional {}".format(self.__array__()) - # TODO: Implement properly - - def __unicode__(self): - """ Unicode representation. """ - # TODO: implement - return self._tidy_repr() - - def __getitem__(self, key): - """ Return an item. """ - return Dimensional(values=self.values[key], dtype=self.dtype) - - def __setitem__(self, key, value): - raise NotImplementedError diff --git a/pandas/core/ops.py b/pandas/core/ops.py index ddf4e5b8889b3..b82e12034b878 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -350,9 +350,10 @@ def __init__(self, left, right, name, na_op): super(_DimFloatOp, self).__init__(left, right, name, na_op) # Get the type of the calculation's result. self.dtype = self._get_target_dtype(left, right, name) - # - self.lvalues = self._with_unit(left.values) - self.rvalues = self._with_unit(right.values) + + print("left, right", type(left), type(right)) + self.lvalues = self._with_unit(left) + self.rvalues = self._with_unit(right) print ("lvals, rvals", type(self.lvalues), type(self.rvalues)) @classmethod @@ -360,8 +361,8 @@ def _get_target_dtype(cls, left, right, name): # Perform the operation on 1* the unit, # to quickly get the resulting unit # Raises an Error, if the units are incompatible - left_unit = cls._get_unit(left.values) - right_unit = cls._get_unit(right.values) + left_unit = cls._get_unit(left) + right_unit = cls._get_unit(right) calc_result = (getattr(1 * left_unit, name)(1 * right_unit)) if isinstance(calc_result, bool): return bool @@ -370,17 +371,22 @@ def _get_target_dtype(cls, left, right, name): @staticmethod def _with_unit(data): + print("with unit: ", data.dtype, type(data.dtype)) if hasattr(data.dtype, "unit"): return data.dtype.unit * data.values - return data.values + return data @staticmethod def _get_unit(data): - if hasattr(data.dtype, "unit"): - return data.dtype.unit + try: + if hasattr(data.dtype, "unit"): + return data.dtype.unit + except AttributeError: + pass return unit_registry.dimensionless + class _TimeOp(_Op): """ Wrapper around Series datetime/time/timedelta arithmetic operations. diff --git a/pandas/core/series.py b/pandas/core/series.py index 0ad726ead1cfd..3177045bca146 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -55,7 +55,6 @@ from pandas.core import generic, base from pandas.core.internals import SingleBlockManager from pandas.core.categorical import Categorical, CategoricalAccessor -from pandas.core.dimensioned import Dimensional import pandas.core.strings as strings from pandas.core.indexes.accessors import ( maybe_to_datetimelike, CombinedDatetimelikeProperties) @@ -152,6 +151,7 @@ class Series(base.IndexOpsMixin, strings.StringAccessorMixin, def __init__(self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): + print("__init__: data = {}, dtype = {}".format(repr(data), repr(dtype))) # we are called internally, so short-circuit if fastpath: @@ -264,7 +264,10 @@ def __init__(self, data=None, index=None, dtype=None, name=None, data = SingleBlockManager(data, index, fastpath=True) generic.NDFrame.__init__(self, data, fastpath=True) - + if is_dimensionedFloat_dtype(dtype): + self._extension_dtype = dtype + else: + self._extension_dtype = None self.name = name self._set_axis(0, index, fastpath=True) @@ -345,7 +348,10 @@ def name(self, value): @property def dtype(self): """ return the dtype object of the underlying data """ - return self._data.dtype + if self._extension_dtype is None: + return self._data.dtype + else: + return self._extension_dtype @property def dtypes(self): @@ -3012,8 +3018,7 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) else: if is_dimensionedFloat_dtype(dtype): - subarr = Dimensional(np.asarray(subarr, dtype="float64"), - dtype=dtype) + subarr = np.asarray(subarr, dtype=dtype.base) except (ValueError, TypeError): if is_categorical_dtype(dtype): subarr = Categorical(arr) @@ -3082,6 +3087,9 @@ def create_from_value(value, index, dtype): subarr = DatetimeIndex([value] * len(index), dtype=dtype) elif is_categorical_dtype(dtype): subarr = Categorical([value] * len(index)) + elif is_dimensionedFloat_dtype(dtype): + subarr = np.empty(len(index), dtype=dtype.base) + subarr.fill(value) else: if not isinstance(dtype, (np.dtype, type(np.dtype))): dtype = dtype.dtype @@ -3090,7 +3098,9 @@ def create_from_value(value, index, dtype): return subarr # scalar like, GH + print(type(subarr), dir(subarr)) if getattr(subarr, 'ndim', 0) == 0: + print("subarr", type(subarr), subarr) if isinstance(data, list): # pragma: no cover subarr = np.array(data, dtype=object) elif index is not None: diff --git a/pandas/tests/dtypes/test_units.py b/pandas/tests/dtypes/test_units.py index 60dd5b695864f..446af2cd1bbec 100644 --- a/pandas/tests/dtypes/test_units.py +++ b/pandas/tests/dtypes/test_units.py @@ -26,22 +26,35 @@ def test_equality(): assert a == a -def test_with_series(): +def test_series_construction(): + a = pd.Series([1, 2, 3, 4, 5], dtype=DimensionedFloatDtype("meter")) + assert a.dtype == DimensionedFloatDtype("meter") - a = pd.Series([1, 2, 3, 4, 5], dtype="datetime64[ns]") +def test_series_addition_same_unit(): a = pd.Series([1, 2, 3, 4, 5], dtype=DimensionedFloatDtype("meter")) b = a + a assert b[0] == 2 assert b.dtype == DimensionedFloatDtype("meter") + assert_allclose(b, pd.Series([2, 4, 6, 8, 10.], + dtype=DimensionedFloatDtype("meter"))) + +def test_series_addition_compatible_units(): + a = pd.Series([1, 2, 3, 4, 5], dtype=DimensionedFloatDtype("meter")) c = pd.Series([5, 10, 50, 100, 500], dtype=DimensionedFloatDtype("centimeter")) assert_allclose(a + c, pd.Series([1.05, 2.1, 3.5, 5, 10.], dtype=DimensionedFloatDtype("meter"))) - assert_allclose(a * c, + +def test_series_addition_compatible_units(): + a = pd.Series([1, 2, 3, 4, 5], dtype=DimensionedFloatDtype("meter")) + c = pd.Series([5, 10, 50, 100, 500], + dtype=DimensionedFloatDtype("centimeter")) + product = a * c + assert_allclose(product, pd.Series([5, 20, 150, 400, 2500.], dtype=DimensionedFloatDtype("meter*centimeter"))) - + assert product.dtype == DimensionedFloatDtype("meter*centimeter") class TestUnitDtype(Base): def create(self): From 950d922efc7faf6c7700aea7aa7cef021d85131b Mon Sep 17 00:00:00 2001 From: Bernhard Thiel Date: Thu, 3 Aug 2017 12:46:48 +0200 Subject: [PATCH 9/9] Restructured Code. Now providing a framework with which external libraries can interface with pandas dtypes. --- .travis.yml | 2 +- pandas/core/categorical.py | 1 + pandas/core/dtypes/common.py | 30 ++--- pandas/core/dtypes/costum_dtypes.py | 187 ++++++++++++++++++++++++++++ pandas/core/dtypes/units.py | 86 ------------- pandas/core/ops.py | 79 ++++-------- pandas/core/series.py | 11 +- pandas/tests/dtypes/test_costum.py | 65 ++++++++++ pandas/tests/dtypes/test_units.py | 94 -------------- 9 files changed, 299 insertions(+), 256 deletions(-) create mode 100644 pandas/core/dtypes/costum_dtypes.py delete mode 100644 pandas/core/dtypes/units.py create mode 100644 pandas/tests/dtypes/test_costum.py delete mode 100644 pandas/tests/dtypes/test_units.py diff --git a/.travis.yml b/.travis.yml index 9aed640406733..897d31cf23a3b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -104,7 +104,7 @@ before_install: - git tag - ci/before_install_travis.sh - export DISPLAY=":99.0" - - pip install pint + install: - echo "install start" - ci/prep_cython_cache.sh diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 2530b3e538aa1..1392ad2f011db 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -249,6 +249,7 @@ class Categorical(PandasObject): _typ = 'categorical' def __init__(self, values, categories=None, ordered=False, fastpath=False): + self._validate_ordered(ordered) if fastpath: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 001bed0ae5f16..a05018edf0798 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -9,13 +9,13 @@ PeriodDtype, PeriodDtypeType, IntervalDtype, IntervalDtypeType, ExtensionDtype) -from .units import DimensionedFloatDtype, DimensionedFloatDtypeType from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, ABCIndexClass) from .inference import is_string_like from .inference import * # noqa +from .costum_dtypes import NumpyDtypeWithMetadata, NumpyDtypeWithMetadataType _POSSIBLY_CAST_DTYPES = set([np.dtype(t).name @@ -509,10 +509,13 @@ def is_categorical_dtype(arr_or_dtype): return CategoricalDtype.is_dtype(arr_or_dtype) -def is_dimensionedFloat_dtype(arr_or_dtype): +def is_numpy_dtype_with_metadata(arr_or_dtype): + """ + Check whether the array or dtype is an instance of NumpyDtypeWithMetadata + """ if arr_or_dtype is None: return False - return DimensionedFloatDtype.is_dtype(arr_or_dtype) + return NumpyDtypeWithMetadata.is_dtype(arr_or_dtype) def is_string_dtype(arr_or_dtype): @@ -693,6 +696,7 @@ def is_dtype_equal(source, target): target = _get_dtype(target) return source == target except (TypeError, AttributeError): + # invalid comparison # object == category will hit this return False @@ -1621,7 +1625,7 @@ def is_extension_type(arr): return True elif is_datetimetz(arr): return True - elif is_dimensionedFloat_dtype(arr): + elif is_numpy_dtype_with_metadata(arr): return True return False @@ -1725,9 +1729,8 @@ def _get_dtype(arr_or_dtype): return arr_or_dtype elif isinstance(arr_or_dtype, IntervalDtype): return arr_or_dtype - elif isinstance(arr_or_dtype, DimensionedFloatDtype): + elif isinstance(arr_or_dtype, NumpyDtypeWithMetadata): return arr_or_dtype - elif isinstance(arr_or_dtype, string_types): if is_categorical_dtype(arr_or_dtype): return CategoricalDtype.construct_from_string(arr_or_dtype) @@ -1737,8 +1740,6 @@ def _get_dtype(arr_or_dtype): return PeriodDtype.construct_from_string(arr_or_dtype) elif is_interval_dtype(arr_or_dtype): return IntervalDtype.construct_from_string(arr_or_dtype) - elif arr_or_dtype.startswith("dimensionedFloat"): - return DimensionedFloatDtype.construct_from_string(arr_or_dtype) elif isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex)): return arr_or_dtype.dtype @@ -1775,8 +1776,8 @@ def _get_dtype_type(arr_or_dtype): return IntervalDtypeType elif isinstance(arr_or_dtype, PeriodDtype): return PeriodDtypeType - elif isinstance(arr_or_dtype, DimensionedFloatDtype): - return DimensionedFloatDtypeType + elif isinstance(arr_or_dtype, NumpyDtypeWithMetadata): + return NumpyDtypeWithMetadataType elif isinstance(arr_or_dtype, string_types): if is_categorical_dtype(arr_or_dtype): return CategoricalDtypeType @@ -1786,8 +1787,6 @@ def _get_dtype_type(arr_or_dtype): return PeriodDtypeType elif is_interval_dtype(arr_or_dtype): return IntervalDtypeType - elif arr_or_dtype.startswith("dimensionedFloat"): - return DimensionedFloatDtypeType return _get_dtype_type(np.dtype(arr_or_dtype)) try: return arr_or_dtype.dtype.type @@ -1896,7 +1895,7 @@ def pandas_dtype(dtype): if isinstance(dtype, DatetimeTZDtype): return dtype - elif isinstance(dtype, DimensionedFloatDtype): + elif isinstance(dtype, NumpyDtypeWithMetadata): return dtype elif isinstance(dtype, PeriodDtype): return dtype @@ -1923,11 +1922,6 @@ def pandas_dtype(dtype): except TypeError: pass - elif dtype.startswith('dimensionedFloat['): - try: - return DimensionedFloatDtype.construct_from_string(dtype) - except TypeError: - pass try: return CategoricalDtype.construct_from_string(dtype) except TypeError: diff --git a/pandas/core/dtypes/costum_dtypes.py b/pandas/core/dtypes/costum_dtypes.py new file mode 100644 index 0000000000000..d3cdf02077ebb --- /dev/null +++ b/pandas/core/dtypes/costum_dtypes.py @@ -0,0 +1,187 @@ +""" +This module contains an interface that external libraries can use to define +their own dtypes compatible with pandas (but NOT NUMPY). +""" + +import numpy as np + +from .dtypes import ExtensionDtype + +class NumpyDtypeWithMetadataType(type): # Do we need this? + """ + The type of NumpyDtypeWithMetadata + """ + pass + +class NumpyDtypeWithMetadata(ExtensionDtype): + + """ + An ExtentionDtype for data where the data + can be stored in a numpy dtype, but the dtype itself + contains meta-data and may redefine arithmetic operations. + + To properly implement caching behaviour, + you might have to implement a __new__ method. + """ + type = NumpyDtypeWithMetadataType + # What attributes should be stored during pickling? + # If this is provided, you usually do not have to + # override __getstate__ + _metadata = [] + + def base(self): + """ + In what numpy-compatible dtype the actual data is stored. + + Example: np.dtype('f8') + """ + raise NotImplementedError("'base' must be implemented by subclass " + "(probably as class-level variable)") + + + @classmethod + def construct_from_string(cls, string): + """ attempt to construct this type from a string, raise a TypeError if + it's not possible """ + raise NotImplementedError("'construct_from_string' must be implemented by subclass.") + + def operation_typecompatible(self, operation_name, other_dtype, is_left=True): + """ + Is the desired operation possible between this dtype and other_dtype? + + Parameters + ---------- + opertation_name: The name of the desired operation, e.g. '__eq__' + other_dtype: The dtype of the other operand + is_left: If this dtype is on the left-hand side of the binary operation. + + Returns + ------- + Boolean or NotImplemented + """ + return False + + def get_operation_wrapper(self): + """ + This is called by `pandas.ops._Op.get_op` to get an object + responsible for type-coercion (which should have the same interface as _Op) + + Returns + ------- + A class implementing the same interface as pandas.ops._Op or None + It should return None, if the default _Op class should be used. + """ + return None + + def to_dtype(self, data): + """ + Convert arbitrary data to this dtype. + + Override this, if you need any additional conversions. + + Parameters + ---------- + data: array-like + + Returns + ------- + An numpy array with the same dtype as self.base + """ + return np.asarray(data, dtype = self.base) + +class AlwaysSame(NumpyDtypeWithMetadata): + """ + This is an example how a library could implement a + subclass of NumpyDtypeWithMetadata, but is it (except for testing) + not useful for anything else. + """ + _metadata = [ "_target_value", "base"] + def __new__(cls, target_value=None): + if target_value is None: + #We are unpickling + return object.__new__(cls) + try: + return cls._cache[target_value] + except KeyError: + d = object.__new__(cls) + d._target_value = target_value + # In this case, we set the base numpy dtype upon object construction. + d.base = np.dtype(type(target_value)) #Raises, if target_value is not a simple number + cls._cache[target_value] = d + return d + + def __hash__(self): + return hash(self._target_value) + + def __unicode__(self): + return "always[{}]".format(repr(self._target_value)) + + def __setstate__(self, state): + try: + self._target_value = state["_target_value"] + except KeyError: + print("state", state) + raise + self.base = np.dtype(type(self._target_value)) + + def __eq__(self, other): + if not isinstance(other, AlwaysSame): + return NotImplemented + return self._target_value == other._target_value + + def to_dtype(self, data): + """ + Fill the array with the target value. + """ + # Since performance is irrelevant for this Test-dtype, we + # do not try to modify data in-place + data = np.ones(np.shape(data), dtype=self.base) + data = data*self._target_value + return data + + def get_operation_wrapper(self): + """ + This is called by `pandas.ops._Op.get_op` to get an object + responsible for type-coercion (which should have the same interface as _Op) + + Returns + ------- + A class implementing the same interface as pandas.ops._Op or None + It should return None, if the default _Op class should be used. + """ + class AlwaysSameOp(): + dtype = None + fill_value = self._target_value + def __init__(self, left, right, name, na_op): + self.left = left + self.right = right + + self.name = name + self.na_op = na_op + + # Here, a conversion of left and right to lvalues and rvalues could take place. + # lvalues must be a type that has the desired operator defined. + self.lvalues = left + self.rvalues = right + return None + def wrap_results(self, results): + print("l,r ", type(self.left), type(self.right)) + # Comparison operators return dtype bool. + if self.name in ["__eq__", "__lt__", "__gt__", "__ge__", "__le__", "__ne__"]: + return results + # All other operators return dtype AlwaysSame + if isinstance(self.left.dtype, AlwaysSame): + target_dtype = self.left.dtype + else: + assert isinstance(self.right.dtype, AlwaysSame) + target_dtype = self.right.dtype + return target_dtype.to_dtype(results) + return AlwaysSameOp + + def operation_typecompatible(self, name, other, is_left=True): + if isinstance(other, AlwaysSame): + if other._target_value != self._target_value: + if type(other) != AlwaysSame: + return NotImplemented #Allow + return False + return True diff --git a/pandas/core/dtypes/units.py b/pandas/core/dtypes/units.py deleted file mode 100644 index f5de32b507b23..0000000000000 --- a/pandas/core/dtypes/units.py +++ /dev/null @@ -1,86 +0,0 @@ - - -import pint - -import numpy as np - -from pandas import compat -from .dtypes import ExtensionDtype - -unit_registry = pint.UnitRegistry() - - -class DimensionedFloatDtypeType(type): - """ - The type of UnitDtype. - """ - pass - - -class DimensionedFloatDtype(ExtensionDtype): - - """ - A dtype for holding float64 nubers with units - - THIS IS NOT A REAL NUMPY DTYPE - """ - type = DimensionedFloatDtypeType - _metadata = ['unit'] - _cache = {} - kind = "f" - str = "f8" - base = np.dtype('f8') - - def __new__(cls, unit=None): - """ Create a new unit if needed, otherwise return from the cache - - Parameters - ---------- - unit : string unit that this represents. - """ - if unit is None: - # we are called as an empty constructor - # generally for pickle compat - return object.__new__(cls) - - # Assume unit is a string. - unit_object = getattr(unit_registry, unit) # TypeError if unit!=string - - # set/retrieve from cache - try: - return cls._cache[unit] - except KeyError: - u = object.__new__(cls) - u.unit = unit_object - cls._cache[unit] = u - return u - - def __hash__(self): - return hash(str(self)) - - def __unicode__(self): - return "dimensionedFloat[{unit}]".format(unit=str(self.unit)) - - def __setstate__(self, state): - # Use the same unit but from our registry, not the pickled unit - # Mixing units from different registries causes errors. - self.unit = getattr(unit_registry, str(state["unit"])) - - def __eq__(self, other): - if isinstance(other, compat.string_types): - return other == str(self) - if not isinstance(other, DimensionedFloatDtype): - return NotImplemented - return self.unit == other.unit - - @classmethod - def construct_from_string(cls, string): - """ attempt to construct this type from a string, raise a TypeError if - it's not possible """ - try: - typename, unit = string.split("[") - if unit[-1] == "]" and typename == "dimensionedFloat": - return cls(unit[:-1]) - except: - pass - raise TypeError("cannot construct a DimensionedFloatDtype") diff --git a/pandas/core/ops.py b/pandas/core/ops.py index b82e12034b878..018440d3fc6e3 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -24,13 +24,12 @@ from pandas.errors import PerformanceWarning from pandas.core.common import _values_from_object, _maybe_match_name from pandas.core.dtypes.missing import notna, isna -from pandas.core.dtypes.units import unit_registry, DimensionedFloatDtype from pandas.core.dtypes.common import ( needs_i8_conversion, is_datetimelike_v_numeric, is_integer_dtype, is_categorical_dtype, is_object_dtype, is_timedelta64_dtype, - is_dimensionedFloat_dtype, + is_numpy_dtype_with_metadata, is_datetime64_dtype, is_datetime64tz_dtype, is_bool_dtype, is_datetimetz, is_list_like, @@ -332,61 +331,37 @@ def get_op(cls, left, right, name, na_op): is_datetime_lhs = (is_datetime64_dtype(left) or is_datetime64tz_dtype(left)) - if (is_dimensionedFloat_dtype(left.dtype) or - is_dimensionedFloat_dtype(right.dtype)): - if (is_datetime_lhs or is_timedelta_lhs): - raise TypeError("Cannot mix DimensionedFloat and " - "Time for operations") - return _DimFloatOp(left, right, name, na_op) + if is_numpy_dtype_with_metadata(left): + left_compatible = left.dtype.operation_typecompatible(name, right.dtype, is_left=True) + if left_compatible is not NotImplemented: + if left_compatible: + op_class = left.dtype.get_operation_wrapper() + if op_class is not None: + return op_class(left, right, name, na_op) + else: + return _Op(left, right, name, na_op) + else: + raise TypeError("Operation {} not permitted between " + "dtype {} and type {}".format(name, left.dtype, + right.dtype)) + # left is either not a NumpyDtypeWithMetadata or did not implement the Operation. + if is_numpy_dtype_with_metadata(right): + if right.dtype.operation_typecompatible(name, left.dtype, is_left=False): + op_class = right.dtype.get_operation_wrapper() + if op_class is not None: + return op_class(left, right, name, na_op) + else: + return _Op(left, right, name, na_op) + else: + raise TypeError("Operation {} not permitted between " + "dtype {} and type {}".format(left.dtype, + right.dtype)) + # No NumpyDtypeWithMetadata involved. if not (is_datetime_lhs or is_timedelta_lhs): return _Op(left, right, name, na_op) else: return _TimeOp(left, right, name, na_op) - -class _DimFloatOp(_Op): - def __init__(self, left, right, name, na_op): - - super(_DimFloatOp, self).__init__(left, right, name, na_op) - # Get the type of the calculation's result. - self.dtype = self._get_target_dtype(left, right, name) - - print("left, right", type(left), type(right)) - self.lvalues = self._with_unit(left) - self.rvalues = self._with_unit(right) - print ("lvals, rvals", type(self.lvalues), type(self.rvalues)) - - @classmethod - def _get_target_dtype(cls, left, right, name): - # Perform the operation on 1* the unit, - # to quickly get the resulting unit - # Raises an Error, if the units are incompatible - left_unit = cls._get_unit(left) - right_unit = cls._get_unit(right) - calc_result = (getattr(1 * left_unit, name)(1 * right_unit)) - if isinstance(calc_result, bool): - return bool - else: - return DimensionedFloatDtype(str(calc_result.units)) - - @staticmethod - def _with_unit(data): - print("with unit: ", data.dtype, type(data.dtype)) - if hasattr(data.dtype, "unit"): - return data.dtype.unit * data.values - return data - - @staticmethod - def _get_unit(data): - try: - if hasattr(data.dtype, "unit"): - return data.dtype.unit - except AttributeError: - pass - return unit_registry.dimensionless - - - class _TimeOp(_Op): """ Wrapper around Series datetime/time/timedelta arithmetic operations. diff --git a/pandas/core/series.py b/pandas/core/series.py index 3177045bca146..21042d5137363 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -23,7 +23,7 @@ is_datetimelike, is_datetime64tz_dtype, is_timedelta64_dtype, - is_dimensionedFloat_dtype, + is_numpy_dtype_with_metadata, is_list_like, is_hashable, is_iterator, @@ -264,7 +264,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None, data = SingleBlockManager(data, index, fastpath=True) generic.NDFrame.__init__(self, data, fastpath=True) - if is_dimensionedFloat_dtype(dtype): + if is_numpy_dtype_with_metadata(dtype): self._extension_dtype = dtype else: self._extension_dtype = None @@ -3017,8 +3017,8 @@ def _try_cast(arr, take_fast_path): if not is_extension_type(subarr) and not is_extension_type(dtype): subarr = np.array(subarr, dtype=dtype, copy=copy) else: - if is_dimensionedFloat_dtype(dtype): - subarr = np.asarray(subarr, dtype=dtype.base) + if is_numpy_dtype_with_metadata(dtype): + subarr = dtype.to_dtype(subarr) except (ValueError, TypeError): if is_categorical_dtype(dtype): subarr = Categorical(arr) @@ -3087,9 +3087,10 @@ def create_from_value(value, index, dtype): subarr = DatetimeIndex([value] * len(index), dtype=dtype) elif is_categorical_dtype(dtype): subarr = Categorical([value] * len(index)) - elif is_dimensionedFloat_dtype(dtype): + elif is_numpy_dtype_with_metadata(dtype): subarr = np.empty(len(index), dtype=dtype.base) subarr.fill(value) + subarr = dtype.to_dtype(subarr) else: if not isinstance(dtype, (np.dtype, type(np.dtype))): dtype = dtype.dtype diff --git a/pandas/tests/dtypes/test_costum.py b/pandas/tests/dtypes/test_costum.py new file mode 100644 index 0000000000000..5750f45117737 --- /dev/null +++ b/pandas/tests/dtypes/test_costum.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +import pytest + +import numpy as np +from numpy.testing import assert_allclose + +import pandas as pd +from pandas.core.dtypes.costum_dtypes import NumpyDtypeWithMetadata, AlwaysSame +from pandas.core.dtypes.common import is_dtype_equal + +from .test_dtypes import Base + +def test_alwys_same_series_construction(): + a = pd.Series([1,2,3,4,5,6], dtype = AlwaysSame(42)) + # The '==' operator for the whole series will be tested elsewhere + for i in range(0,6): + assert a[i]==42 + +def test_always_same_with_series_eq(): + a = pd.Series([1,2,3,4,5,6], dtype = AlwaysSame(42)) + assert np.all(a == a) + assert np.all(a == pd.Series([42,42,42,42,42,42], dtype = int)) + assert np.all(a != pd.Series([1,2,4,4,4,4], dtype = AlwaysSame(41))) + assert np.all(a == pd.Series([1,2,4,4,4,4], dtype = AlwaysSame(42))) + + +def test_always_same_with_series_add(): + a = pd.Series([1,2,3,4,5,6], dtype = AlwaysSame(42)) + assert (a == a + a).all() + + +def test_always_same_with_series_add_mixing_types(): + a = pd.Series([1,2,3,4,5,6], dtype = AlwaysSame(42)) + b = pd.Series([1,2,3,4,5,6], dtype = AlwaysSame(43)) + c = pd.Series([1,2,3,4,5,6], dtype = int) + pytest.raises(TypeError, + lambda: a + b) + pytest.raises(TypeError, + lambda: b + a) + assert (a + c == a).all() + assert (c + a == a).all() + + +class TestAlwaysSame(Base): + def create(self): + return AlwaysSame(42) + + def test_hash_vs_equality(self): + # make sure that we satisfy is semantics + dtype = self.dtype + dtype2 = AlwaysSame(42) + assert dtype == dtype2 + assert dtype2 == dtype + assert dtype is dtype2 + assert dtype2 is dtype + assert hash(dtype) == hash(dtype2) + + +class TestNumpyDtypeWithMetadata: + + def test_is_not_dtype(self): + assert not NumpyDtypeWithMetadata.is_dtype(None) + assert not NumpyDtypeWithMetadata.is_dtype(np.float64) + assert NumpyDtypeWithMetadata.is_dtype(AlwaysSame('Beer')) diff --git a/pandas/tests/dtypes/test_units.py b/pandas/tests/dtypes/test_units.py deleted file mode 100644 index 446af2cd1bbec..0000000000000 --- a/pandas/tests/dtypes/test_units.py +++ /dev/null @@ -1,94 +0,0 @@ -# -*- coding: utf-8 -*- - -import pytest - -import numpy as np -from numpy.testing import assert_allclose - -import pandas as pd -from pandas.core.dtypes.units import DimensionedFloatDtype -from pandas.core.dtypes.common import is_dtype_equal - -from .test_dtypes import Base - - -def test_construction_string(): - """ - Assert that ainstances of UnitDType have - """ - a = DimensionedFloatDtype("meter") - assert isinstance(a, DimensionedFloatDtype) - - -def test_equality(): - assert DimensionedFloatDtype("meter") == DimensionedFloatDtype("meter") - a = DimensionedFloatDtype("meter") - assert a == a - - -def test_series_construction(): - a = pd.Series([1, 2, 3, 4, 5], dtype=DimensionedFloatDtype("meter")) - assert a.dtype == DimensionedFloatDtype("meter") - -def test_series_addition_same_unit(): - a = pd.Series([1, 2, 3, 4, 5], dtype=DimensionedFloatDtype("meter")) - b = a + a - assert b[0] == 2 - assert b.dtype == DimensionedFloatDtype("meter") - assert_allclose(b, pd.Series([2, 4, 6, 8, 10.], - dtype=DimensionedFloatDtype("meter"))) - -def test_series_addition_compatible_units(): - a = pd.Series([1, 2, 3, 4, 5], dtype=DimensionedFloatDtype("meter")) - c = pd.Series([5, 10, 50, 100, 500], - dtype=DimensionedFloatDtype("centimeter")) - assert_allclose(a + c, - pd.Series([1.05, 2.1, 3.5, 5, 10.], - dtype=DimensionedFloatDtype("meter"))) - -def test_series_addition_compatible_units(): - a = pd.Series([1, 2, 3, 4, 5], dtype=DimensionedFloatDtype("meter")) - c = pd.Series([5, 10, 50, 100, 500], - dtype=DimensionedFloatDtype("centimeter")) - product = a * c - assert_allclose(product, - pd.Series([5, 20, 150, 400, 2500.], - dtype=DimensionedFloatDtype("meter*centimeter"))) - assert product.dtype == DimensionedFloatDtype("meter*centimeter") - -class TestUnitDtype(Base): - def create(self): - return DimensionedFloatDtype("meter") - - def test_hash_vs_equality(self): - # make sure that we satisfy is semantics - dtype = self.dtype - dtype2 = DimensionedFloatDtype('meter') - assert dtype == dtype2 - assert dtype2 == dtype - assert dtype is dtype2 - assert dtype2 is dtype - assert hash(dtype) == hash(dtype2) - - def test_construction(self): - pytest.raises(Exception, # pint.UndefinedUnitError - lambda: DimensionedFloatDtype('thisIsNotAUnit')) - - def test_construction_from_string(self): - result = DimensionedFloatDtype.construct_from_string( - 'dimensionedFloat[meter]') - assert is_dtype_equal(self.dtype, result) - pytest.raises(TypeError, - lambda: DimensionedFloatDtype.construct_from_string('foo')) - - def test_is_dtype(self): - assert not DimensionedFloatDtype.is_dtype(None) - assert DimensionedFloatDtype.is_dtype(self.dtype) - assert DimensionedFloatDtype.is_dtype('dimensionedFloat[meter]') - assert not DimensionedFloatDtype.is_dtype('foo') - assert DimensionedFloatDtype.is_dtype(DimensionedFloatDtype('hours')) - assert not DimensionedFloatDtype.is_dtype(np.float64) - - def test_equality(self): - assert is_dtype_equal(self.dtype, 'dimensionedFloat[meter]') - assert not is_dtype_equal(self.dtype, 'foo')