Skip to content

Commit 2ef5216

Browse files
committed
REF: Define extension base classes
1 parent ca2d261 commit 2ef5216

File tree

9 files changed

+566
-80
lines changed

9 files changed

+566
-80
lines changed

pandas/core/arrays/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1+
from .base import ExtensionArray # noqa
12
from .categorical import Categorical # noqa

pandas/core/arrays/base.py

+201
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
"""An interface for extending pandas with custom arrays."""
2+
import abc
3+
4+
import numpy as np
5+
6+
from pandas.compat import add_metaclass
7+
8+
9+
_not_implemented_message = "{} does not implement {}."
10+
11+
12+
@add_metaclass(abc.ABCMeta)
class ExtensionArray(object):
    """Abstract base class for custom array types.

    pandas will recognize instances of this class as proper arrays
    with a custom type and will not attempt to coerce them to objects.

    Notes
    -----
    Subclasses must implement the abstract members: ``__getitem__``,
    ``__iter__``, ``__len__``, ``dtype``, ``nbytes``, ``isna``, ``take``,
    ``copy``, ``_formatting_values``, ``_concat_same_type`` and
    ``get_values``. Every other member has a default implementation that
    may be overridden.
    """
    # ------------------------------------------------------------------------
    # Must be a Sequence
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def __getitem__(self, item):
        # type: (Any) -> Any
        """Select a subset of self.

        Notes
        -----
        As a sequence, __getitem__ should expect integer or slice ``item``.

        For slice ``item``, you should return an instance of yourself, even
        if the slice is length 0 or 1.

        For scalar ``item``, you may return a scalar suitable for your type.
        The scalar need not be an instance or subclass of your array type.
        """

    def __setitem__(self, key, value):
        # type: (Any, Any) -> None
        """Assignment is unsupported by default; raises NotImplementedError."""
        raise NotImplementedError(_not_implemented_message.format(
            type(self), '__setitem__')
        )

    @abc.abstractmethod
    def __iter__(self):
        # type: () -> Iterator
        """Iterate over the elements of the array."""
        pass

    @abc.abstractmethod
    def __len__(self):
        # type: () -> int
        """Return the number of elements in the array."""
        pass

    # ------------------------------------------------------------------------
    # Required attributes
    # ------------------------------------------------------------------------
    @property
    def base(self):
        """The base array I am a view of. None by default."""
        return None

    @property
    @abc.abstractmethod
    def dtype(self):
        # type: () -> ExtensionDtype
        """An instance of 'ExtensionDtype'."""
        pass

    @property
    def shape(self):
        # type: () -> Tuple[int, ...]
        """One-element tuple holding ``len(self)``."""
        return (len(self),)

    @property
    def ndim(self):
        # type: () -> int
        """Extension Arrays are only allowed to be 1-dimensional."""
        return 1

    @property
    @abc.abstractmethod
    def nbytes(self):
        # type: () -> int
        """The number of bytes needed to store this object in memory."""
        pass

    # ------------------------------------------------------------------------
    # Additional Methods
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def isna(self):
        # type: () -> np.ndarray
        """Boolean NumPy array indicating if each value is missing."""
        pass

    # ------------------------------------------------------------------------
    # Indexing methods
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def take(self, indexer, allow_fill=True, fill_value=None):
        # type: (Sequence, bool, Optional[Any]) -> ExtensionArray
        """Select elements by ``indexer``; used for slicing.

        NOTE(review): the exact semantics of ``allow_fill`` and
        ``fill_value`` are left to the implementation here -- confirm
        against the callers before relying on them.
        """

    def take_nd(self, indexer, allow_fill=True, fill_value=None):
        """For slicing; forwards all arguments to :meth:`take`."""
        # TODO: this isn't really necessary for 1-D
        return self.take(indexer, allow_fill=allow_fill,
                         fill_value=fill_value)

    @abc.abstractmethod
    def copy(self, deep=False):
        # type: (bool) -> ExtensionArray
        """Return a copy of the array."""

    # ------------------------------------------------------------------------
    # Block-related methods
    # ------------------------------------------------------------------------
    @property
    def _fill_value(self):
        # type: () -> Any
        """The missing value for this type, e.g. np.nan. None by default."""
        return None

    @abc.abstractmethod
    def _formatting_values(self):
        # type: () -> np.ndarray
        """An array of values to be printed in, e.g. the Series repr."""
        # At the moment, this has to be an array since we use result.dtype

    @classmethod
    @abc.abstractmethod
    def _concat_same_type(cls, to_concat):
        # type: (Sequence[ExtensionArray]) -> ExtensionArray
        """Concatenate multiple arrays.

        Parameters
        ----------
        to_concat : sequence of this type

        Returns
        -------
        ExtensionArray
        """

    @abc.abstractmethod
    def get_values(self):
        # type: () -> np.ndarray
        """Get the underlying values backing your data."""
        pass

    @property
    def _can_hold_na(self):
        # type: () -> bool
        """Whether your array can hold missing values. True by default.

        Notes
        -----
        Setting this to false will optimize some operations like fillna.

        Exposed as a property so that it agrees with ``Categorical``,
        which overrides it as a property; as a plain method, attribute
        access would yield an always-truthy bound method.
        """
        return True

    @property
    def is_sparse(self):
        # type: () -> bool
        """Whether your array is sparse. False by default."""
        return False

    def _slice(self, slicer):
        # type: (Union[tuple, Sequence, int]) -> 'ExtensionArray'
        """Return a new array sliced by `slicer`.

        Parameters
        ----------
        slicer : slice or np.ndarray
            If an array, it should just be a boolean mask

        Returns
        -------
        array : ExtensionArray
            Should return an ExtensionArray, even if ``self[slicer]``
            would return a scalar.

        Notes
        -----
        The default implementation passes ``self[slicer]`` to the
        constructor of ``type(self)``, so it relies on that constructor
        accepting whatever ``__getitem__`` returns.
        """
        return type(self)(self[slicer])

    def value_counts(self, dropna=True):
        """Optional method for computing the histogram of the counts.

        Parameters
        ----------
        dropna : bool, default True
            whether to exclude missing values from the computation

        Returns
        -------
        counts : Series
        """
        from pandas.core.algorithms import value_counts

        if dropna:
            # XXX: this imposes boolean indexing on subclasses
            values = self[~np.asarray(self.isna())]
        else:
            # Keep missing entries so ``dropna=False`` is honoured; masking
            # unconditionally would discard them regardless of ``dropna``.
            values = self
        return value_counts(np.asarray(values), dropna=dropna)

pandas/core/arrays/categorical.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
from pandas.util._validators import validate_bool_kwarg
4545
from pandas.core.config import get_option
4646

47+
from .base import ExtensionArray
48+
4749

4850
def _cat_compare_op(op):
4951
def f(self, other):
@@ -149,7 +151,7 @@ def _maybe_to_categorical(array):
149151
"""
150152

151153

152-
class Categorical(PandasObject):
154+
class Categorical(ExtensionArray, PandasObject):
153155
"""
154156
Represents a categorical variable in classic R / S-plus fashion
155157
@@ -2131,6 +2133,20 @@ def repeat(self, repeats, *args, **kwargs):
21312133
return self._constructor(values=codes, categories=self.categories,
21322134
ordered=self.ordered, fastpath=True)
21332135

2136+
# Interface things
2137+
# can_hold_na, concat_same_type, formatting_values
2138+
@property
2139+
def _can_hold_na(self):
2140+
return True
2141+
2142+
@classmethod
2143+
def _concat_same_type(self, to_concat):
2144+
from pandas.types.concat import union_categoricals
2145+
return union_categoricals(to_concat)
2146+
2147+
def _formatting_values(self):
2148+
return self
2149+
21342150
# The Series.cat accessor
21352151

21362152

pandas/core/dtypes/base.py

+92
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""Extend pandas with custom array types"""
2+
import abc
3+
4+
from pandas.compat import add_metaclass
5+
6+
7+
@add_metaclass(abc.ABCMeta)
class ExtensionDtype(object):
    """A custom data type for your array.

    Subclasses must provide ``name``; ``type``, ``kind`` and ``names``
    have defaults, and ``construct_from_string`` / ``is_dtype`` have
    default implementations built on ``name``.
    """
    @property
    def type(self):
        """Typically a metaclass inheriting from 'type' with no methods.

        The default builds a new, empty class named after ``self.name``
        on every access.
        """
        return type(self.name, (), {})

    @property
    def kind(self):
        """A character code (one of 'biufcmMOSUV'), default 'O'

        See Also
        --------
        numpy.dtype.kind
        """
        return 'O'

    @property
    @abc.abstractmethod
    def name(self):
        """A string identifying the data type.

        Will be used in, e.g. ``Series.dtype``
        """

    @property
    def names(self):
        """Ordered list of field names, or None if there are no fields"""
        return None

    @classmethod
    def construct_from_string(cls, string):
        """Attempt to construct this type from a string.

        Parameters
        ----------
        string : str

        Returns
        -------
        self : instance of 'cls'

        Raises
        ------
        TypeError
            If 'string' does not match this type's name.

        Notes
        -----
        The default implementation checks if 'string' matches your
        type's name. If so, it calls your class with no arguments.

        The comparison uses ``cls.name``, i.e. the attribute as seen on
        the class. If ``name`` is implemented as a property, that lookup
        yields the property object rather than a string, so the default
        only matches when ``name`` is defined as a class-level attribute.
        """
        if string == cls.name:
            return cls()
        else:
            raise TypeError("Cannot construct a '{}' from "
                            "'{}'".format(cls, string))

    @classmethod
    def is_dtype(cls, dtype):
        """Check if we match 'dtype'

        Parameters
        ----------
        dtype : str, type or object

        Returns
        -------
        is_dtype : bool

        Notes
        -----
        The default implementation is True if

        1. 'dtype' is a string that returns true for
           ``cls.construct_from_string``
        2. 'dtype' is ``cls`` or a subclass of ``cls``.
        3. 'dtype' is an instance of ``cls``.

        Any other object yields False.
        """
        if isinstance(dtype, str):
            try:
                return isinstance(cls.construct_from_string(dtype), cls)
            except TypeError:
                return False
        if isinstance(dtype, type):
            return issubclass(dtype, cls)
        # ``issubclass`` raises TypeError when its first argument is not a
        # class, so instances (including instances of ``cls``) and other
        # arbitrary objects are answered with an ``isinstance`` check.
        return isinstance(dtype, cls)

pandas/core/dtypes/common.py

+32
Original file line numberDiff line numberDiff line change
@@ -1685,6 +1685,38 @@ def is_extension_type(arr):
16851685
return False
16861686

16871687

1688+
def is_extension_array_dtype(arr_or_dtype):
    """Check if an object is a pandas extension array type

    Parameters
    ----------
    arr_or_dtype : object
        A Series is unwrapped to its ``.values`` before the check.

    Returns
    -------
    bool
        True when the (unwrapped) object is an instance of
        ``ExtensionDtype`` or ``ExtensionArray``.

    Notes
    -----
    This checks whether an object implements the pandas extension
    array interface. In pandas, this is meant to include:

    * Categorical
    * PeriodArray
    * IntervalArray
    * SparseArray

    (TODO confirm: the check is purely ``isinstance``-based, so each of
    these only qualifies if it subclasses ``ExtensionArray``.)

    Third-party libraries may implement arrays or types satisfying
    this interface as well.
    """
    from pandas.core.arrays import ExtensionArray

    # we want to unpack series, anything else?
    candidate = (arr_or_dtype.values
                 if isinstance(arr_or_dtype, ABCSeries)
                 else arr_or_dtype)
    extension_kinds = (ExtensionDtype, ExtensionArray)
    return isinstance(candidate, extension_kinds)
1718+
1719+
16881720
def is_complex_dtype(arr_or_dtype):
16891721
"""
16901722
Check whether the provided array or dtype is of a complex dtype.

0 commit comments

Comments
 (0)