Skip to content

WIP: Dict Array Extension #29557

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pandas/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pandas.core.arrays import (
Categorical,
DatetimeArray,
DictArray,
IntegerArray,
IntervalArray,
PandasArray,
Expand All @@ -18,6 +19,7 @@
__all__ = [
"Categorical",
"DatetimeArray",
"DictArray",
"IntegerArray",
"IntervalArray",
"PandasArray",
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
)
from .categorical import Categorical # noqa: F401
from .datetimes import DatetimeArray # noqa: F401
from .dict import DictArray # noqa: F401
from .integer import IntegerArray, integer_array # noqa: F401
from .interval import IntervalArray # noqa: F401
from .numpy_ import PandasArray, PandasDtype # noqa: F401
Expand Down
134 changes: 134 additions & 0 deletions pandas/core/arrays/dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
from typing import Any, Dict, Hashable, Optional, Sequence, Type

import numpy as np

from pandas._libs import lib

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.dtypes import register_extension_dtype

import pandas as pd
from pandas._typing import Axis, Dtype
import pandas.core.accessor as accessor
from pandas.core.arrays.numpy_ import PandasArray
from pandas.core.construction import extract_array


@register_extension_dtype
class DictDtype(ExtensionDtype):
    """
    Extension dtype for nested dictionaries.

    .. versionadded:: 1.0.0

    .. warning::

       DictDtype is considered experimental. The implementation and
       parts of the API may change without warning.
    """

    def __repr__(self) -> str:
        return "DictDtype"

    @property
    def name(self) -> str:
        """
        The alias for DictDtype is ``'dict'``.
        """
        return "dict"

    @property
    def type(self) -> Type:
        """Scalar type of the array elements: the builtin ``dict``."""
        return dict

    @property
    def na_value(self) -> float:
        """Marker used for missing entries: ``np.nan``."""
        return np.nan

    @classmethod
    def construct_array_type(cls) -> Type["DictArray"]:
        """Return the array class paired with this dtype."""
        return DictArray

    @classmethod
    def construct_from_string(cls, string: str) -> ExtensionDtype:
        """
        Build the dtype from its string alias.

        ``"dict"`` yields a new ``DictDtype``; every other input is handed
        to the base-class implementation.
        """
        if string != "dict":
            return super().construct_from_string(string)
        return cls()


class DictArray(PandasArray):
    """
    Extension array for nested dictionaries.

    .. versionadded:: 1.0.0

    .. warning::

       DictArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : sequence of dict
        Passed through ``extract_array`` and then to the ``PandasArray``
        constructor. Every element must be a dict or a scalar missing value.
    copy : bool, default False
        Forwarded to the ``PandasArray`` constructor.

    Raises
    ------
    ValueError
        If the stored ndarray is not object dtype, or if any element is
        neither a dict nor a scalar missing value.
    """

    # undo the PandasArray hack
    _typ = "extension"

    def __init__(self, values: Sequence[Dict], copy: bool = False):
        np_values = extract_array(values)
        super().__init__(np_values, copy=copy)
        self._dtype = DictDtype()
        self._validate()

    @staticmethod
    def _is_valid_element(value: Any) -> bool:
        """Return True if ``value`` is a dict or a scalar missing value."""
        if isinstance(value, dict):
            return True
        # Only ask ``pd.isnull`` about scalars: for list-like input it
        # returns an array, whose truth value is ambiguous.
        return bool(lib.is_scalar(value) and pd.isnull(value))

    def _validate(self):
        """
        Validate that we only store dicts (or missing values).

        Raises
        ------
        ValueError
            If the ndarray is not object dtype or holds a non-dict,
            non-missing element.
        """
        if self._ndarray.dtype != "object":
            raise ValueError(
                "DictArray requires a sequence of dicts. Got "
                "'{}' dtype instead.".format(self._ndarray.dtype)
            )
        # Object dtype alone is not enough: an object ndarray can hold
        # anything. Review note on the PR asked for validation of object
        # arrays "that contain more than just dicts", hence the element scan.
        for element in self._ndarray:
            if not self._is_valid_element(element):
                raise ValueError(
                    "DictArray requires a sequence of dicts. Got an element "
                    "of type '{}' instead.".format(type(element).__name__)
                )

    @classmethod
    def _from_sequence(
        cls, dicts: Sequence[Dict], dtype: Optional[Dtype] = None, copy: bool = False
    ) -> "DictArray":
        """
        Construct a DictArray from a sequence of dicts.

        Parameters
        ----------
        dicts : sequence of dict
            Elements may also be None / missing; those are stored as np.nan.
        dtype : str or dtype, optional
            If given, must compare equal to ``"dict"``.
        copy : bool, default False
            Forwarded to the parent ``_from_sequence``.

        Raises
        ------
        TypeError
            If ``dtype`` is given and is not the dict dtype.
        """
        # Explicit raise instead of ``assert``: asserts vanish under ``-O``.
        if dtype is not None and dtype != "dict":
            raise TypeError(f"Cannot construct a DictArray with dtype '{dtype}'")

        result = super()._from_sequence(dicts, dtype=object, copy=copy)
        # convert None to np.nan
        # TODO: it would be nice to do this in _validate, which already
        # scans over the values.
        result[result.isna()] = np.nan
        return result

    def __setitem__(self, key: Hashable, value: Any):
        """
        Set ``value`` at ``key`` after checking that only dicts or missing
        values are being stored.

        Raises
        ------
        TypeError
            If ``value`` (or any element of a list-like ``value``) is neither
            a dict nor a scalar missing value.
        """
        # TODO(review): this class could be made generic by adding an
        # abstract base class for all container types, leaving subclasses to
        # clarify the few details that differ between list, set and dict.

        # A dict is a single element here even though ``lib.is_scalar``
        # reports it as non-scalar; iterating it would walk its keys and
        # wrongly reject the assignment.
        if isinstance(value, dict) or lib.is_scalar(value):
            candidates = [value]
        else:
            candidates = value

        for candidate in candidates:
            if not self._is_valid_element(candidate):
                raise TypeError(
                    f"Cannot set non-dict value {candidate} into DictArray"
                )

        # NOTE(review): relies on PandasArray.__setitem__ storing a dict
        # passed as a single value into the object ndarray — confirm.
        super().__setitem__(key, value)

    def _reduce(self, name: str, axis: Axis = 0, **kwargs):
        """Reductions are not supported; always raises NotImplementedError."""
        raise NotImplementedError


class DictAccessor(accessor.PandasDelegate):
    """
    Accessor offering dict-style lookups over a Series backed by a DictArray.

    Parameters
    ----------
    obj : Series
        Object whose ``.array`` must be a DictArray.

    Raises
    ------
    AttributeError
        If ``obj.array`` is not a DictArray.
    """

    def __init__(self, obj: "pd.Series"):
        if not isinstance(obj.array, DictArray):
            raise AttributeError("Can only use .dict accessor with a DictArray")
        self._obj = obj.array

    def __getitem__(self, key: Hashable) -> "pd.Series":
        """
        Look up ``key`` in every dict.

        Missing (non-dict) entries propagate as NaN. A dict lacking ``key``
        raises KeyError, as plain dict indexing does.
        """
        # The array may hold NaN for missing rows; indexing a float would
        # raise TypeError, so pass those through as NaN.
        return pd.Series(
            [x[key] if isinstance(x, dict) else np.nan for x in self._obj]
        )

    def get(self, key: Hashable, default=np.nan) -> "pd.Series":
        """
        Look up ``key`` in every dict, using ``default`` where it is absent.

        Parameters
        ----------
        key : hashable
            Key to look up in each dict.
        default : object, default np.nan
            Value used for dicts that do not contain ``key``.

        Returns
        -------
        Series
            One value per element; missing (non-dict) entries yield NaN.
        """
        # TODO: default should be positional only
        # TODO: justify np.nan as the default - maybe because values coerce
        #  that way in the resulting Series construction?
        return pd.Series(
            [
                x.get(key, default) if isinstance(x, dict) else np.nan
                for x in self._obj
            ]
        )
2 changes: 2 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import ExtensionArray, try_cast_to_ea
from pandas.core.arrays.categorical import Categorical, CategoricalAccessor
from pandas.core.arrays.dict import DictAccessor
from pandas.core.arrays.sparse import SparseAccessor
import pandas.core.common as com
from pandas.core.construction import extract_array, sanitize_array
Expand Down Expand Up @@ -4759,6 +4760,7 @@ def to_period(self, freq=None, copy=True):
cat = CachedAccessor("cat", CategoricalAccessor)
plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
sparse = CachedAccessor("sparse", SparseAccessor)
dict = CachedAccessor("dict", DictAccessor)

# ----------------------------------------------------------------------
# Add plotting methods to Series
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/arrays/dict/test_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import numpy as np
import pytest

import pandas as pd
import pandas.util.testing as tm


def test_none_to_nan():
    """None entries passed to ``_from_sequence`` come back as NaN."""
    arr = pd.arrays.DictArray._from_sequence([{"a": 1}, None, {"a": 1}])
    missing = arr[1]
    assert missing is not None
    assert np.isnan(missing)


def test_setitem_validates():
    """Assigning non-dict scalars or arrays raises TypeError."""
    arr = pd.arrays.DictArray._from_sequence([{"a": 1}, {"a": 2}])
    msg = "Cannot set non-dict value"

    # scalar assignment
    with pytest.raises(TypeError, match=msg):
        arr[0] = "not_a_dict"

    # array assignment through a full slice
    with pytest.raises(TypeError, match=msg):
        arr[:] = np.array([1, 2])


def test_constructor_raises():
    """Non-object ndarrays are rejected by the constructor."""
    msg = "sequence of dicts"

    bytes_values = np.array(["a", "b"], dtype="S1")
    with pytest.raises(ValueError, match=msg):
        pd.arrays.DictArray(bytes_values)

    empty_values = np.array([])
    with pytest.raises(ValueError, match=msg):
        pd.arrays.DictArray(empty_values)


def test_reduce_raises():
    """Reductions on a dict-dtype Series raise NotImplementedError."""
    records = [{"a": 1}, {"a": 2}, {"a": 3}]
    ser = pd.Series(records, dtype="dict")

    with pytest.raises(NotImplementedError):
        ser.sum()


def test_getitem():
    """``.dict[key]`` pulls the value for ``key`` out of every dict."""
    ser = pd.Series([{"a": 1}, {"a": 2}, {"a": 3}], dtype="dict")
    expected = pd.Series([1, 2, 3])

    result = ser.dict["a"]

    tm.assert_series_equal(result, expected)


def test_get():
    """``.dict.get(key)`` yields NaN for dicts that lack ``key``."""
    ser = pd.Series([{"a": 1}, {"b": 2}, {"a": 3}], dtype="dict")
    expected = pd.Series([1.0, np.nan, 3.0])

    result = ser.dict.get("a")

    tm.assert_series_equal(result, expected)