Skip to content

Commit e8620ab

Browse files
ENH: Array Interface and Categorical internals Refactor (pandas-dev#19268)
* REF: Define extension base classes * Updated for comments * removed take_nd * Changed to_dense to return get_values * Fixed docstrings, types * Removed is_sparse * Remove metaclasses from PeriodDtype and IntervalDtype * Fixup form_blocks rebase * Restore concat casting cat -> object * Remove _slice, clarify semantics around __getitem__ * Document and use take. * Clarify type, kind, init * Remove base * API: Remove unused __iter__ and get_values * API: Implement repr and str * Remove default value_counts for now * Fixed merge conflicts * Remove implementation of construct_from_string * Example implementation of take * Cleanup ExtensionBlock * Pass through ndim * Use series._values * Removed repr, updated take doc * Various cleanups * Handle get_values, to_dense, is_view * Docs * Remove is_extension, is_bool Remove inherited convert * Sparse formatter * Revert "Sparse formatter" This reverts commit ab2f045. * Unbox SparseSeries * Added test for sparse consolidation * Docs * Moved to errors * Handle classmethods, properties * Use our AbstractMethodError * Lint * Cleanup * Move ndim validation to a method. * Try this * Make ExtensionBlock._holder a property Removed ExtensionBlock.__init__ * Make _holder a property for all * Refactored validate_ndim * fixup! Refactored validate_ndim * lint
1 parent 69cd5fb commit e8620ab

File tree

14 files changed

+803
-103
lines changed

14 files changed

+803
-103
lines changed

pandas/core/arrays/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1+
from .base import ExtensionArray # noqa
12
from .categorical import Categorical # noqa

pandas/core/arrays/base.py

+247
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
"""An interface for extending pandas with custom arrays."""
2+
from pandas.errors import AbstractMethodError
3+
4+
_not_implemented_message = "{} does not implement {}."


class ExtensionArray(object):
    """Abstract base class for custom 1-D array types.

    pandas will recognize instances of this class as proper arrays
    with a custom type and will not attempt to coerce them to objects. They
    may be stored directly inside a :class:`DataFrame` or :class:`Series`.

    Notes
    -----
    The interface includes the following abstract methods that must be
    implemented by subclasses:

    * __getitem__
    * __len__
    * dtype
    * nbytes
    * isna
    * take
    * copy
    * _formatting_values
    * _concat_same_type

    The following members are part of pandas' internal, private block API.
    They have default implementations here and may be overridden:

    * _can_hold_na (property, ``True`` by default)
    * _fill_value (property, ``None`` by default)

    This class does not inherit from 'abc.ABCMeta' for performance reasons.
    Methods and properties required by the interface raise
    ``pandas.errors.AbstractMethodError`` and no ``register`` method is
    provided for registering virtual subclasses.

    ExtensionArrays are limited to 1 dimension.

    They may be backed by none, one, or many NumPy arrays. For example,
    ``pandas.Categorical`` is an extension array backed by two arrays,
    one for codes and one for categories. An array of IPv6 addresses may
    be backed by a NumPy structured array with two fields, one for the
    lower 64 bits and one for the upper 64 bits. Or they may be backed
    by some other storage type, like Python lists. Pandas makes no
    assumptions on how the data are stored, just that it can be converted
    to a NumPy array.

    Extension arrays should be able to be constructed with instances of
    the class, i.e. ``ExtensionArray(extension_array)`` should return
    an instance, not error.

    Additionally, certain methods and interfaces are required for this
    array to be properly stored inside a ``DataFrame`` or ``Series``.
    """
    # ------------------------------------------------------------------------
    # Must be a Sequence
    # ------------------------------------------------------------------------
    def __getitem__(self, item):
        # type: (Any) -> Any
        """Select a subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.

            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None

            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or ExtensionArray

        Raises
        ------
        AbstractMethodError
            Always, in this base class; subclasses must override.

        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.

        For slice ``item``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.

        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        raise AbstractMethodError(self)

    def __setitem__(self, key, value):
        # type: (Any, Any) -> None
        """Set one or more values in place.

        Item assignment is optional for extension arrays. This default
        implementation always raises, so arrays are effectively immutable
        unless a subclass overrides it.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError(_not_implemented_message.format(
            type(self), '__setitem__')
        )

    def __len__(self):
        # type: () -> int
        """Length of this array

        Returns
        -------
        length : int

        Raises
        ------
        AbstractMethodError
            Always, in this base class; subclasses must override.
        """
        raise AbstractMethodError(self)

    # ------------------------------------------------------------------------
    # Required attributes
    # ------------------------------------------------------------------------
    @property
    def dtype(self):
        # type: () -> ExtensionDtype
        """An instance of 'ExtensionDtype'."""
        raise AbstractMethodError(self)

    @property
    def shape(self):
        # type: () -> Tuple[int, ...]
        """A 1-tuple holding the length of the array, ``(len(self),)``."""
        return (len(self),)

    @property
    def ndim(self):
        # type: () -> int
        """Extension Arrays are only allowed to be 1-dimensional."""
        return 1

    @property
    def nbytes(self):
        # type: () -> int
        """The number of bytes needed to store this object in memory.

        If this is expensive to compute, return an approximate lower bound
        on the number of bytes needed.
        """
        raise AbstractMethodError(self)

    # ------------------------------------------------------------------------
    # Additional Methods
    # ------------------------------------------------------------------------
    def isna(self):
        # type: () -> np.ndarray
        """Boolean NumPy array indicating if each value is missing.

        This should return a 1-D array the same length as 'self'.
        """
        raise AbstractMethodError(self)

    # ------------------------------------------------------------------------
    # Indexing methods
    # ------------------------------------------------------------------------
    def take(self, indexer, allow_fill=True, fill_value=None):
        # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray
        """Take elements from an array.

        Parameters
        ----------
        indexer : sequence of integers
            indices to be taken. -1 is used to indicate values
            that are missing.
        allow_fill : bool, default True
            If False, indexer is assumed to contain no -1 values so no filling
            will be done. This short-circuits computation of a mask. Result is
            undefined if allow_fill == False and -1 is present in indexer.
        fill_value : any, default None
            Fill value to replace -1 values with. By default, this uses
            the missing value sentinel for this type, ``self._fill_value``.

        Returns
        -------
        ExtensionArray

        Notes
        -----
        This should follow pandas' semantics where -1 indicates missing values.
        Positions where indexer is ``-1`` should be filled with the missing
        value for this type.

        This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the
        indexer is a sequence of values.

        Examples
        --------
        Suppose the extension array is somehow backed by a NumPy structured
        array and that the underlying structured array is stored as
        ``self.data``. Then ``take`` may be written as

        .. code-block:: python

           def take(self, indexer, allow_fill=True, fill_value=None):
               # Coerce first so that ``indexer == -1`` is elementwise
               # even when a plain list is passed.
               indexer = np.asarray(indexer)
               result = self.data.take(indexer)
               if allow_fill:
                   if fill_value is None:
                       fill_value = self._fill_value
                   result[indexer == -1] = fill_value
               return type(self)(result)
        """
        raise AbstractMethodError(self)

    def copy(self, deep=False):
        # type: (bool) -> ExtensionArray
        """Return a copy of the array.

        Parameters
        ----------
        deep : bool, default False
            Also copy the underlying data backing this array.

        Returns
        -------
        ExtensionArray
        """
        raise AbstractMethodError(self)

    # ------------------------------------------------------------------------
    # Block-related methods
    # ------------------------------------------------------------------------
    @property
    def _fill_value(self):
        # type: () -> Any
        """The missing value for this type, e.g. np.nan"""
        return None

    def _formatting_values(self):
        # type: () -> np.ndarray
        # At the moment, this has to be an array since we use result.dtype
        """An array of values to be printed in, e.g. the Series repr"""
        raise AbstractMethodError(self)

    @classmethod
    def _concat_same_type(cls, to_concat):
        # type: (Sequence[ExtensionArray]) -> ExtensionArray
        """Concatenate multiple arrays of this type.

        Parameters
        ----------
        to_concat : sequence of this type

        Returns
        -------
        ExtensionArray
        """
        # ``cls`` is a class rather than an instance here; flag that so the
        # error message names the class itself instead of its metaclass.
        raise AbstractMethodError(cls, methodtype='classmethod')

    @property
    def _can_hold_na(self):
        # type: () -> bool
        """Whether your array can hold missing values. True by default.

        Notes
        -----
        Setting this to false will optimize some operations like fillna.

        This is a property so that ``array._can_hold_na`` evaluates to a
        real boolean. A plain method accessed as an attribute would be a
        bound-method object, which is always truthy.
        """
        return True

pandas/core/arrays/categorical.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
from pandas.util._validators import validate_bool_kwarg
4444
from pandas.core.config import get_option
4545

46+
from .base import ExtensionArray
47+
4648

4749
def _cat_compare_op(op):
4850
def f(self, other):
@@ -148,7 +150,7 @@ def _maybe_to_categorical(array):
148150
"""
149151

150152

151-
class Categorical(PandasObject):
153+
class Categorical(ExtensionArray, PandasObject):
152154
"""
153155
Represents a categorical variable in classic R / S-plus fashion
154156
@@ -2130,6 +2132,20 @@ def repeat(self, repeats, *args, **kwargs):
21302132
return self._constructor(values=codes, categories=self.categories,
21312133
ordered=self.ordered, fastpath=True)
21322134

2135+
# Implement the ExtensionArray interface
2136+
@property
2137+
def _can_hold_na(self):
2138+
return True
2139+
2140+
@classmethod
def _concat_same_type(cls, to_concat):
    """Concatenate several arrays of this type into one.

    Part of the ExtensionArray interface. All of the work is delegated
    to ``pandas.core.dtypes.concat._concat_categorical``.

    Parameters
    ----------
    to_concat : sequence of arrays
        The arrays to combine, passed through unchanged to
        ``_concat_categorical``.

    Returns
    -------
    Whatever ``_concat_categorical`` returns for ``to_concat``.
    """
    # Function-scope import, kept from the original.
    # NOTE(review): presumably avoids a circular import between this module
    # and pandas.core.dtypes.concat -- confirm before hoisting to file level.
    from pandas.core.dtypes.concat import _concat_categorical

    return _concat_categorical(to_concat)
2145+
2146+
def _formatting_values(self):
2147+
return self
2148+
21332149
# The Series.cat accessor
21342150

21352151

pandas/core/common.py

+2-14
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525

2626
# compat
2727
from pandas.errors import ( # noqa
28-
PerformanceWarning, UnsupportedFunctionCall, UnsortedIndexError)
28+
PerformanceWarning, UnsupportedFunctionCall, UnsortedIndexError,
29+
AbstractMethodError)
2930

3031
# back-compat of public API
3132
# deprecate these functions
@@ -88,19 +89,6 @@ class SettingWithCopyWarning(Warning):
8889
pass
8990

9091

91-
class AbstractMethodError(NotImplementedError):
92-
"""Raise this error instead of NotImplementedError for abstract methods
93-
while keeping compatibility with Python 2 and Python 3.
94-
"""
95-
96-
def __init__(self, class_instance):
97-
self.class_instance = class_instance
98-
99-
def __str__(self):
100-
msg = "This method must be defined in the concrete class of {name}"
101-
return (msg.format(name=self.class_instance.__class__.__name__))
102-
103-
10492
def flatten(l):
10593
"""Flatten an arbitrarily nested sequence.
10694

0 commit comments

Comments
 (0)