Skip to content

Commit a9e0972

Browse files
committed
REF: Define extension base classes
1 parent 4b06ae4 commit a9e0972

File tree

4 files changed

+295
-8
lines changed

4 files changed

+295
-8
lines changed

pandas/core/arrays/base.py

+182
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
"""An interface for extending pandas with custom arrays."""
2+
import abc
from typing import Any, Optional, Sequence, Tuple, Union  # noqa

import numpy as np
6+
7+
# Template for NotImplementedError messages: the first field receives the
# array's type, the second the name of the method that is not implemented.
_not_implemented_message = "{} does not implement {}."
8+
9+
10+
class ExtensionArray(metaclass=abc.ABCMeta):
    """Abstract base class for custom array types.

    pandas will recognize instances of this class as proper arrays
    with a custom type and will not attempt to coerce them to objects.

    Subclasses must implement every member marked abstract below:
    ``__getitem__``, ``__iter__``, ``__len__``, ``dtype``, ``nbytes``,
    ``isna``, ``take``, ``take_nd``, ``copy``, ``_formatting_values``,
    ``_concat_same_type``, ``get_values`` and ``_slice``. The remaining
    members have default implementations that may be overridden.
    """

    # ------------------------------------------------------------------------
    # Must be a Sequence
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def __getitem__(self, item):
        """Return the element(s) selected by ``item``."""

    def __setitem__(self, key, value):
        """Assign ``value`` at ``key``.

        Raises
        ------
        NotImplementedError
            Always, in this default implementation; mutable subclasses
            override it.
        """
        raise NotImplementedError(_not_implemented_message.format(
            type(self), '__setitem__')
        )

    @abc.abstractmethod
    def __iter__(self):
        """Iterate over the elements of the array."""

    @abc.abstractmethod
    def __len__(self):
        """Return the number of elements in the array."""

    # ------------------------------------------------------------------------
    # Required attributes
    # ------------------------------------------------------------------------
    @property
    @abc.abstractmethod
    def dtype(self):
        # type: () -> ExtensionDtype
        """The ExtensionDtype describing this array."""

    @property
    def shape(self):
        # type: () -> Tuple[int, ...]
        """One-element tuple holding the length of the array."""
        return (len(self),)

    @property
    def ndim(self):
        # type: () -> int
        """Extension Arrays are only allowed to be 1-dimensional."""
        return 1

    @property
    @abc.abstractmethod
    def nbytes(self):
        # type: () -> int
        """The number of bytes needed to store this array."""
        # TODO: default impl?

    # ------------------------------------------------------------------------
    # Additional Methods
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def isna(self):
        # type: () -> Sequence[bool]
        """Return one boolean per element, True where the value is missing."""
        # TODO: narrow this type?

    # ------------------------------------------------------------------------
    # Indexing methods
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def take(self, indexer, allow_fill=True, fill_value=None):
        # type: (Sequence, bool, Optional[Any]) -> ExtensionArray
        """Return the elements selected by ``indexer`` (used for slicing).

        NOTE(review): the exact semantics of ``allow_fill`` and
        ``fill_value`` are not specified here -- confirm against callers.
        """

    @abc.abstractmethod
    def take_nd(self, indexer, allow_fill=True, fill_value=None):
        """Return the elements selected by ``indexer`` (used for slicing)."""
        # TODO: this isn't necessary if we only allow 1D (though maybe
        # implement it).

    @abc.abstractmethod
    def copy(self, deep=False):
        # type: (bool) -> ExtensionArray
        """Return a copy of the array."""

    # ------------------------------------------------------------------------
    # Block-related methods
    # ------------------------------------------------------------------------
    @property
    def _fill_value(self):
        # type: () -> Any
        """The missing value for this type, e.g. np.nan. None by default."""
        return None

    @abc.abstractmethod
    def _formatting_values(self):
        # type: () -> np.ndarray
        """An array of values to be printed in, e.g. the Series repr."""
        # At the moment, this has to be an array since we use result.dtype

    @classmethod
    @abc.abstractmethod
    def _concat_same_type(cls, to_concat):
        # type: (Sequence[ExtensionArray]) -> ExtensionArray
        """Concatenate multiple arrays of this type.

        Parameters
        ----------
        to_concat : sequence of this type

        Returns
        -------
        ExtensionArray
        """

    @abc.abstractmethod
    def get_values(self):
        # type: () -> np.ndarray
        """Get the underlying values backing your data."""

    @property
    def _can_hold_na(self):
        # type: () -> bool
        """Whether your array can hold missing values. True by default.

        Notes
        -----
        Setting this to false will optimize some operations like fillna.

        Exposed as a property so that ``arr._can_hold_na`` yields a real
        boolean, matching how Categorical overrides it; as a plain method
        the attribute would be a bound method, which is always truthy.
        """
        return True

    @property
    def is_sparse(self):
        # type: () -> bool
        """Whether your array is sparse. False by default."""
        return False

    @abc.abstractmethod
    def _slice(self, slicer):
        # type: (Union[tuple, Sequence, int]) -> ExtensionArray
        """Return a new array sliced by `slicer`.

        Parameters
        ----------
        slicer : slice or np.ndarray
            If an array, it should just be a boolean mask

        Returns
        -------
        array : ExtensionArray
            Should return an ExtensionArray, even if ``self[slicer]``
            would return a scalar.
        """
        # XXX: We could get rid of this *if* we require that
        # ExtensionArray(extension_array[x]) always work.
        # That seems fine for when extension_array[x] is an ExtensionArray
        # but what if extension_array[x] reduces dimensionality?

    def value_counts(self, dropna=True):
        """Optional method for computing the histogram of the counts.

        Parameters
        ----------
        dropna : bool, default True
            whether to exclude missing values from the computation

        Returns
        -------
        counts : Series
        """
        from pandas.core.algorithms import value_counts

        if dropna:
            # XXX: this imposes boolean indexing on subclasses
            values = self[~np.asarray(self.isna())]
        else:
            # Keep the missing entries; filtering them here would make
            # ``dropna=False`` indistinguishable from ``dropna=True``.
            values = self
        return value_counts(np.asarray(values), dropna=dropna)

pandas/core/arrays/categorical.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
from pandas.util._validators import validate_bool_kwarg
4545
from pandas.core.config import get_option
4646

47+
from .base import ExtensionArray
48+
4749

4850
def _cat_compare_op(op):
4951
def f(self, other):
@@ -149,7 +151,7 @@ def _maybe_to_categorical(array):
149151
"""
150152

151153

152-
class Categorical(PandasObject):
154+
class Categorical(ExtensionArray, PandasObject):
153155
"""
154156
Represents a categorical variable in classic R / S-plus fashion
155157
@@ -2131,6 +2133,20 @@ def repeat(self, repeats, *args, **kwargs):
21312133
return self._constructor(values=codes, categories=self.categories,
21322134
ordered=self.ordered, fastpath=True)
21332135

2136+
# Interface things
2137+
# can_hold_na, concat_same_type, formatting_values
2138+
@property
def _can_hold_na(self):
    """Whether this array can hold missing values; always True here."""
    return True

@classmethod
def _concat_same_type(cls, to_concat):
    """Concatenate several categoricals into one.

    Parameters
    ----------
    to_concat : sequence of Categorical

    Returns
    -------
    Categorical
        The result of ``union_categoricals(to_concat)``.
    """
    # Imported locally, presumably to avoid a circular import at module
    # load time -- confirm before hoisting to the top of the file.
    from pandas.core.dtypes.concat import union_categoricals
    return union_categoricals(to_concat)

def _formatting_values(self):
    """Return the values used when formatting; the array itself."""
    return self
2149+
21342150
# The Series.cat accessor
21352151

21362152

pandas/core/dtypes/base.py

+89
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
"""Extend pandas with custom array types"""
2+
import abc
3+
4+
5+
class ExtensionDtype(metaclass=abc.ABCMeta):
    """A custom data type for your array.

    Subclasses must provide ``name``; every other member has a default
    implementation that may be overridden.
    """

    @property
    def type(self):
        """Typically a metaclass inheriting from 'type' with no methods."""
        # NOTE(review): this builds a brand-new empty class named after
        # ``self.name`` on every access, so two accesses never return the
        # same object. Subclasses that need a stable type should override.
        return type(self.name, (), {})

    @property
    def kind(self):
        """A character code (one of 'biufcmMOSUV'), default 'O'

        See Also
        --------
        numpy.dtype.kind
        """
        return 'O'

    @property
    @abc.abstractmethod
    def name(self):
        """A string identifying the data type.

        Will be used in, e.g. ``Series.dtype``
        """

    @property
    def names(self):
        """Ordered list of field names, or None if there are no fields"""
        return None

    @classmethod
    def construct_from_string(cls, string):
        """Attempt to construct this type from a string.

        Parameters
        ----------
        string : str

        Returns
        -------
        self : instance of 'cls'

        Raises
        ------
        TypeError
            If 'string' does not match this type's name.

        Notes
        -----
        The default implementation checks if 'string' matches your
        type's name. If so, it calls your class with no arguments.

        The comparison uses the class-level ``name`` attribute, so it can
        only succeed when the subclass defines ``name`` as a plain class
        attribute. If ``name`` is a property, the class attribute is the
        property object, which never equals a string.
        """
        if string == cls.name:
            return cls()
        else:
            raise TypeError("Cannot construct a '{}' from "
                            "'{}'".format(cls, string))

    @classmethod
    def is_dtype(cls, dtype):
        """Check if we match 'dtype'

        Parameters
        ----------
        dtype : str or dtype

        Returns
        -------
        is_dtype : bool

        Notes
        -----
        The default implementation is True if

        1. 'dtype' is a string that returns true for
           ``cls.construct_from_string``
        2. 'dtype' is ``cls`` or a subclass of ``cls``.
        3. 'dtype' is an instance of ``cls``.

        Any other object yields False.
        """
        if isinstance(dtype, str):
            try:
                return isinstance(cls.construct_from_string(dtype), cls)
            except TypeError:
                return False
        if isinstance(dtype, type):
            return issubclass(dtype, cls)
        # ``issubclass`` raises TypeError for non-class arguments, so
        # instances (of this dtype or anything else) are checked here.
        return isinstance(dtype, cls)

pandas/core/dtypes/dtypes.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55
from pandas import compat
66
from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex
77

8+
from .base import ExtensionDtype
89

9-
class ExtensionDtype(object):
10+
11+
class PandasExtensionDtype(ExtensionDtype):
1012
"""
1113
A np.dtype duck-typed class, suitable for holding a custom dtype.
1214
1315
THIS IS NOT A REAL NUMPY DTYPE
1416
"""
15-
name = None
16-
names = None
1717
type = None
1818
subdtype = None
1919
kind = None
@@ -108,7 +108,7 @@ class CategoricalDtypeType(type):
108108
pass
109109

110110

111-
class CategoricalDtype(ExtensionDtype):
111+
class CategoricalDtype(PandasExtensionDtype):
112112
"""
113113
Type for categorical data with the categories and orderedness
114114
@@ -387,7 +387,7 @@ class DatetimeTZDtypeType(type):
387387
pass
388388

389389

390-
class DatetimeTZDtype(ExtensionDtype):
390+
class DatetimeTZDtype(PandasExtensionDtype):
391391

392392
"""
393393
A np.dtype duck-typed class, suitable for holding a custom datetime with tz
@@ -501,7 +501,7 @@ class PeriodDtypeType(type):
501501
pass
502502

503503

504-
class PeriodDtype(ExtensionDtype):
504+
class PeriodDtype(PandasExtensionDtype):
505505
__metaclass__ = PeriodDtypeType
506506
"""
507507
A Period duck-typed class, suitable for holding a period with freq dtype.
@@ -619,7 +619,7 @@ class IntervalDtypeType(type):
619619
pass
620620

621621

622-
class IntervalDtype(ExtensionDtype):
622+
class IntervalDtype(PandasExtensionDtype):
623623
__metaclass__ = IntervalDtypeType
624624
"""
625625
A Interval duck-typed class, suitable for holding an interval

0 commit comments

Comments
 (0)