diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py
index 9870b5bed076d..fae9a84a0aeae 100644
--- a/pandas/arrays/__init__.py
+++ b/pandas/arrays/__init__.py
@@ -6,6 +6,7 @@
 from pandas.core.arrays import (
     Categorical,
     DatetimeArray,
+    DictArray,
     IntegerArray,
     IntervalArray,
     PandasArray,
@@ -18,6 +19,7 @@
 __all__ = [
     "Categorical",
     "DatetimeArray",
+    "DictArray",
     "IntegerArray",
     "IntervalArray",
     "PandasArray",
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
index 03d998707c26b..5ee72c3d4947c 100644
--- a/pandas/core/arrays/__init__.py
+++ b/pandas/core/arrays/__init__.py
@@ -6,6 +6,7 @@
 )
 from .categorical import Categorical  # noqa: F401
 from .datetimes import DatetimeArray  # noqa: F401
+from .dict import DictArray  # noqa: F401
 from .integer import IntegerArray, integer_array  # noqa: F401
 from .interval import IntervalArray  # noqa: F401
 from .numpy_ import PandasArray, PandasDtype  # noqa: F401
diff --git a/pandas/core/arrays/dict.py b/pandas/core/arrays/dict.py
new file mode 100644
index 0000000000000..f1c4cf17d2b98
--- /dev/null
+++ b/pandas/core/arrays/dict.py
@@ -0,0 +1,201 @@
+from typing import Any, Dict, Hashable, Optional, Sequence, Type
+
+import numpy as np
+
+from pandas._libs import lib
+
+from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.dtypes import register_extension_dtype
+
+import pandas as pd
+from pandas._typing import Axis, Dtype
+import pandas.core.accessor as accessor
+from pandas.core.arrays.numpy_ import PandasArray
+from pandas.core.construction import extract_array
+
+
+@register_extension_dtype
+class DictDtype(ExtensionDtype):
+    """
+    Extension dtype for nested dictionaries.
+
+    .. versionadded:: 1.0.0
+
+    .. warning::
+
+       DictDtype is considered experimental. The implementation and
+       parts of the API may change without warning.
+    """
+
+    @property
+    def na_value(self) -> float:
+        """Value used to represent missing entries (``np.nan``)."""
+        return np.nan
+
+    @property
+    def type(self) -> Type:
+        """Scalar type of the elements (``dict``)."""
+        return dict
+
+    @property
+    def name(self) -> str:
+        """
+        The alias for DictDtype is ``'dict'``.
+        """
+        return "dict"
+
+    @classmethod
+    def construct_from_string(cls, string: str) -> ExtensionDtype:
+        """Return a DictDtype for ``"dict"``; otherwise defer to the parent."""
+        if string == "dict":
+            return cls()
+
+        return super().construct_from_string(string)
+
+    @classmethod
+    def construct_array_type(cls) -> Type["DictArray"]:
+        """Return the array class paired with this dtype."""
+        return DictArray
+
+    def __repr__(self) -> str:
+        return "DictDtype"
+
+
+class DictArray(PandasArray):
+    """
+    Extension array for nested dictionaries.
+
+    .. versionadded:: 1.0.0
+
+    .. warning::
+
+       DictArray is considered experimental. The implementation and
+       parts of the API may change without warning.
+    """
+
+    # undo the PandasArray hack
+    _typ = "extension"
+
+    def __init__(self, values: Sequence[Dict], copy: bool = False):
+        np_values = extract_array(values)
+        super().__init__(np_values, copy=copy)
+        self._dtype = DictDtype()
+        self._validate()
+
+    def _validate(self):
+        """
+        Check that the backing ndarray has object dtype.
+
+        Only the dtype is inspected; the elements themselves are not scanned.
+
+        Raises
+        ------
+        ValueError
+            If the backing ndarray does not have object dtype.
+        """
+        if self._ndarray.dtype != "object":
+            raise ValueError(
+                "DictArray requires a sequence of dicts. Got "
+                "'{}' dtype instead.".format(self._ndarray.dtype)
+            )
+
+    @classmethod
+    def _from_sequence(
+        cls, dicts: Sequence[Dict], dtype: Optional[Dtype] = None, copy: bool = False
+    ) -> "DictArray":
+        """
+        Build a DictArray from a sequence of dicts.
+
+        Entries reported as missing by ``isna`` are stored as ``np.nan``.
+        """
+        if dtype:
+            assert dtype == "dict"
+
+        result = super()._from_sequence(dicts, dtype=object, copy=copy)
+        # convert None to np.nan
+        # TODO: it would be nice to do this in _validate, but that would
+        # require _validate to scan the values, which it currently does not.
+        result[result.isna()] = np.nan
+        return result
+
+    def __setitem__(self, key: Hashable, value: Any):
+        """
+        Set entries, accepting only dicts and missing values.
+
+        Raises
+        ------
+        TypeError
+            If ``value``, or any element of a non-scalar ``value``, is
+            neither a dict nor a missing value.
+        """
+
+        def check_value(item):
+            if isinstance(item, dict):
+                return
+            # Only ask isnull about scalars: for a list-like it returns an
+            # array, whose truth value would be ambiguous.
+            if lib.is_scalar(item) and pd.isnull(item):
+                return
+            raise TypeError(f"Cannot set non-dict value {item} into DictArray")
+
+        # lib.is_scalar is False for a dict, so a single dict has to be
+        # recognised explicitly; otherwise its keys would be validated.
+        if isinstance(value, dict) or lib.is_scalar(value):
+            check_value(value)
+        else:
+            for val in value:
+                check_value(val)
+
+        super().__setitem__(key, value)
+
+    def _reduce(self, name: str, axis: Axis = 0, **kwargs):
+        """Reductions are not implemented for DictArray."""
+        raise NotImplementedError
+
+
+class DictAccessor(accessor.PandasDelegate):
+    """
+    Accessor offering dict-style lookups on data backed by a DictArray.
+
+    Parameters
+    ----------
+    obj : Series
+        Object whose ``.array`` is a DictArray.
+
+    Raises
+    ------
+    AttributeError
+        If ``obj.array`` is not a DictArray.
+    """
+
+    def __init__(self, obj: "pd.Series"):
+        if not isinstance(obj.array, DictArray):
+            raise AttributeError("Can only use .dict accessor with a DictArray")
+        self._obj = obj.array
+        # Keep the index so that results stay aligned with the caller's data.
+        self._index = obj.index
+
+    def _map_dicts(self, func) -> "pd.Series":
+        """Apply ``func`` to each dict; missing entries become ``np.nan``."""
+        values = [func(x) if isinstance(x, dict) else np.nan for x in self._obj]
+        return pd.Series(values, index=self._index)
+
+    def __getitem__(self, key: Hashable):
+        """
+        Look up ``key`` in every dict.
+
+        Raises
+        ------
+        KeyError
+            If any dict lacks ``key``.
+        """
+        return self._map_dicts(lambda d: d[key])
+
+    def get(self, key: Hashable, default=np.nan):
+        """
+        Look up ``key`` in every dict, using ``default`` where it is absent.
+
+        Entries that are missing altogether become ``np.nan``.
+        """
+        # TODO: default should be positional only
+        return self._map_dicts(lambda d: d.get(key, default))
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 15f405e244d0f..8e9d302da6977 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -54,6 +54,7 @@
 from pandas.core.accessor import CachedAccessor
 from pandas.core.arrays import ExtensionArray, try_cast_to_ea
 from pandas.core.arrays.categorical import Categorical, CategoricalAccessor
+from pandas.core.arrays.dict import DictAccessor
 from pandas.core.arrays.sparse import SparseAccessor
 import pandas.core.common as com
 from pandas.core.construction import extract_array, sanitize_array
@@ -4759,6 +4760,7 @@ def to_period(self, freq=None, copy=True):
     cat = CachedAccessor("cat", CategoricalAccessor)
     plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
     sparse = CachedAccessor("sparse", SparseAccessor)
+    dict = CachedAccessor("dict", DictAccessor)
 
     # ----------------------------------------------------------------------
     # Add plotting methods to Series
diff --git a/pandas/tests/arrays/dict/test_dict.py b/pandas/tests/arrays/dict/test_dict.py
new file mode 100644
index 0000000000000..807cd11c6d9d9
--- /dev/null
+++ b/pandas/tests/arrays/dict/test_dict.py
@@ -0,0 +1,73 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+
+def test_none_to_nan():
+    a = pd.arrays.DictArray._from_sequence([{"a": 1}, None, {"a": 1}])
+    assert a[1] is not None
+    assert np.isnan(a[1])
+
+
+def test_setitem_validates():
+    a = pd.arrays.DictArray._from_sequence([{"a": 1}, {"a": 2}])
+    with pytest.raises(TypeError, match="Cannot set non-dict value"):
+        a[0] = "not_a_dict"
+
+    with pytest.raises(TypeError, match="Cannot set non-dict value"):
+        a[:] = np.array([1, 2])
+
+
+def test_setitem_dict_scalar():
+    a = pd.arrays.DictArray._from_sequence([{"a": 1}, {"a": 2}])
+    a[0] = {"b": 3}
+    assert a[0] == {"b": 3}
+
+
+def test_constructor_raises():
+    with pytest.raises(ValueError, match="sequence of dicts"):
+        pd.arrays.DictArray(np.array(["a", "b"], dtype="S1"))
+
+    with pytest.raises(ValueError, match="sequence of dicts"):
+        pd.arrays.DictArray(np.array([]))
+
+
+def test_reduce_raises():
+    arr = pd.Series([{"a": 1}, {"a": 2}, {"a": 3}], dtype="dict")
+
+    with pytest.raises(NotImplementedError):
+        arr.sum()
+
+
+def test_getitem():
+    arr = pd.Series([{"a": 1}, {"a": 2}, {"a": 3}], dtype="dict")
+    result = arr.dict["a"]
+    expected = pd.Series([1, 2, 3])
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_getitem_keeps_index_and_missing():
+    arr = pd.Series([{"a": 1}, None, {"a": 3}], index=list("xyz"), dtype="dict")
+    result = arr.dict["a"]
+    expected = pd.Series([1.0, np.nan, 3.0], index=list("xyz"))
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_get():
+    arr = pd.Series([{"a": 1}, {"b": 2}, {"a": 3}], dtype="dict")
+    result = arr.dict.get("a")
+    expected = pd.Series([1.0, np.nan, 3.0])
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_get_default():
+    arr = pd.Series([{"a": 1}, {"b": 2}, {"a": 3}], dtype="dict")
+    result = arr.dict.get("a", -1)
+    expected = pd.Series([1, -1, 3])
+
+    tm.assert_series_equal(result, expected)