Skip to content

Commit 2cbdd9a

Browse files
TomAugspurger authored and jorisvandenbossche committed
ExtensionArray.take default implementation (pandas-dev#20814)
Implements a take interface that's compatible with NumPy and optionally pandas' NA semantics. Closes pandas-dev#20640
1 parent 96f2f57 commit 2cbdd9a

File tree

19 files changed

+460
-81
lines changed

19 files changed

+460
-81
lines changed

pandas/api/extensions/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,6 @@
22
from pandas.core.accessor import (register_dataframe_accessor, # noqa
33
register_index_accessor,
44
register_series_accessor)
5+
from pandas.core.algorithms import take # noqa
56
from pandas.core.arrays.base import ExtensionArray # noqa
67
from pandas.core.dtypes.dtypes import ExtensionDtype # noqa

pandas/core/algorithms.py

+89-1
Original file line numberDiff line numberDiff line change
@@ -1448,6 +1448,94 @@ def func(arr, indexer, out, fill_value=np.nan):
14481448
return func
14491449

14501450

1451+
def take(arr, indices, allow_fill=False, fill_value=None):
1452+
"""
1453+
Take elements from an array.
1454+
1455+
.. versionadded:: 0.23.0
1456+
1457+
Parameters
1458+
----------
1459+
arr : sequence
1460+
Non array-likes (sequences without a dtype) are coerced
1461+
to an ndarray.
1462+
indices : sequence of integers
1463+
Indices to be taken.
1464+
allow_fill : bool, default False
1465+
How to handle negative values in `indices`.
1466+
1467+
* False: negative values in `indices` indicate positional indices
1468+
from the right (the default). This is similar to :func:`numpy.take`.
1469+
1470+
* True: negative values in `indices` indicate
1471+
missing values. These values are set to `fill_value`. Any other
1472+
negative values raise a ``ValueError``.
1473+
1474+
fill_value : any, optional
1475+
Fill value to use for NA-indices when `allow_fill` is True.
1476+
This may be ``None``, in which case the default NA value for
1477+
the type (``self.dtype.na_value``) is used.
1478+
1479+
Returns
1480+
-------
1481+
ndarray or ExtensionArray
1482+
Same type as the input.
1483+
1484+
Raises
1485+
------
1486+
IndexError
1487+
When `indices` is out of bounds for the array.
1488+
ValueError
1489+
When the indexer contains negative values other than ``-1``
1490+
and `allow_fill` is True.
1491+
1492+
Notes
1493+
-----
1494+
When `allow_fill` is False, `indices` may be whatever dimensionality
1495+
is accepted by NumPy for `arr`.
1496+
1497+
When `allow_fill` is True, `indices` should be 1-D.
1498+
1499+
See Also
1500+
--------
1501+
numpy.take
1502+
1503+
Examples
1504+
--------
1505+
>>> from pandas.api.extensions import take
1506+
1507+
With the default ``allow_fill=False``, negative numbers indicate
1508+
positional indices from the right.
1509+
1510+
>>> take(np.array([10, 20, 30]), [0, 0, -1])
1511+
array([10, 10, 30])
1512+
1513+
Setting ``allow_fill=True`` will place `fill_value` in those positions.
1514+
1515+
>>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
1516+
array([10., 10., nan])
1517+
1518+
>>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
1519+
... fill_value=-10)
1520+
array([ 10, 10, -10])
1521+
"""
1522+
from pandas.core.indexing import validate_indices
1523+
1524+
if not is_array_like(arr):
1525+
arr = np.asarray(arr)
1526+
1527+
indices = np.asarray(indices, dtype=np.intp)
1528+
1529+
if allow_fill:
1530+
# Pandas style, -1 means NA
1531+
validate_indices(indices, len(arr))
1532+
result = take_1d(arr, indices, allow_fill=True, fill_value=fill_value)
1533+
else:
1534+
# NumPy style
1535+
result = arr.take(indices)
1536+
return result
1537+
1538+
14511539
def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
14521540
allow_fill=True):
14531541
"""
@@ -1462,7 +1550,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
14621550
Input array.
14631551
indexer : ndarray
14641552
1-D array of indices to take, subarrays corresponding to -1 value
1465-
indicies are filed with fill_value
1553+
indices are filled with fill_value
14661554
axis : int, default 0
14671555
Axis to take from
14681556
out : ndarray or None, default None

pandas/core/arrays/base.py

+61-35
Original file line numberDiff line numberDiff line change
@@ -462,22 +462,36 @@ def factorize(self, na_sentinel=-1):
462462
# ------------------------------------------------------------------------
463463
# Indexing methods
464464
# ------------------------------------------------------------------------
465-
def take(self, indexer, allow_fill=True, fill_value=None):
465+
466+
def take(self, indices, allow_fill=False, fill_value=None):
466467
# type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray
467468
"""Take elements from an array.
468469
469470
Parameters
470471
----------
471-
indexer : sequence of integers
472-
indices to be taken. -1 is used to indicate values
473-
that are missing.
474-
allow_fill : bool, default True
475-
If False, indexer is assumed to contain no -1 values so no filling
476-
will be done. This short-circuits computation of a mask. Result is
477-
undefined if allow_fill == False and -1 is present in indexer.
478-
fill_value : any, default None
479-
Fill value to replace -1 values with. If applicable, this should
480-
use the sentinel missing value for this type.
472+
indices : sequence of integers
473+
Indices to be taken.
474+
allow_fill : bool, default False
475+
How to handle negative values in `indices`.
476+
477+
* False: negative values in `indices` indicate positional indices
478+
from the right (the default). This is similar to
479+
:func:`numpy.take`.
480+
481+
* True: negative values in `indices` indicate
482+
missing values. These values are set to `fill_value`. Any other
483+
negative values raise a ``ValueError``.
484+
485+
fill_value : any, optional
486+
Fill value to use for NA-indices when `allow_fill` is True.
487+
This may be ``None``, in which case the default NA value for
488+
the type, ``self.dtype.na_value``, is used.
489+
490+
For many ExtensionArrays, there will be two representations of
491+
`fill_value`: a user-facing "boxed" scalar, and a low-level
492+
physical NA value. `fill_value` should be the user-facing version,
493+
and the implementation should handle translating that to the
494+
physical version for processing the take if necessary.
481495
482496
Returns
483497
-------
@@ -486,44 +500,56 @@ def take(self, indexer, allow_fill=True, fill_value=None):
486500
Raises
487501
------
488502
IndexError
489-
When the indexer is out of bounds for the array.
503+
When the indices are out of bounds for the array.
504+
ValueError
505+
When `indices` contains negative values other than ``-1``
506+
and `allow_fill` is True.
490507
491508
Notes
492509
-----
493-
This should follow pandas' semantics where -1 indicates missing values.
494-
Positions where indexer is ``-1`` should be filled with the missing
495-
value for this type.
496-
This gives rise to the special case of a take on an empty
497-
ExtensionArray that does not raises an IndexError straight away
498-
when the `indexer` is all ``-1``.
510+
ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
511+
``.iloc``, when `indices` is a sequence of values. Additionally,
512+
it's called by :meth:`Series.reindex`, or any other method
513+
that causes realignment, with a `fill_value`.
499514
500-
This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the
501-
indexer is a sequence of values.
515+
See Also
516+
--------
517+
numpy.take
518+
pandas.api.extensions.take
502519
503520
Examples
504521
--------
505-
Suppose the extension array is backed by a NumPy array stored as
506-
``self.data``. Then ``take`` may be written as
522+
Here's an example implementation, which relies on casting the
523+
extension array to object dtype. This uses the helper method
524+
:func:`pandas.api.extensions.take`.
507525
508526
.. code-block:: python
509527
510-
def take(self, indexer, allow_fill=True, fill_value=None):
511-
indexer = np.asarray(indexer)
512-
mask = indexer == -1
528+
def take(self, indices, allow_fill=False, fill_value=None):
529+
from pandas.core.algorithms import take
513530
514-
# take on empty array not handled as desired by numpy
515-
# in case of -1 (all missing take)
516-
if not len(self) and mask.all():
517-
return type(self)([np.nan] * len(indexer))
531+
# If the ExtensionArray is backed by an ndarray, then
532+
# just pass that here instead of coercing to object.
533+
data = self.astype(object)
518534
519-
result = self.data.take(indexer)
520-
result[mask] = np.nan # NA for this type
521-
return type(self)(result)
535+
if allow_fill and fill_value is None:
536+
fill_value = self.dtype.na_value
522537
523-
See Also
524-
--------
525-
numpy.take
538+
# fill value should always be translated from the scalar
539+
# type for the array, to the physical storage type for
540+
# the data, before passing to take.
541+
542+
result = take(data, indices, fill_value=fill_value,
543+
allow_fill=allow_fill)
544+
return self._from_sequence(result)
526545
"""
546+
# Implementer note: The `fill_value` parameter should be a user-facing
547+
# value, an instance of self.dtype.type. When passed `fill_value=None`,
548+
# the default of `self.dtype.na_value` should be used.
549+
# This may differ from the physical storage type your ExtensionArray
550+
# uses. In this case, your implementation is responsible for casting
551+
# the user-facing type to the storage type, before using
552+
# pandas.api.extensions.take
527553
raise AbstractMethodError(self)
528554

529555
def copy(self, deep=False):

pandas/core/dtypes/base.py

+11
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ class _DtypeOpsMixin(object):
1616
# classes will inherit from this Mixin. Once everything is compatible, this
1717
# class's methods can be moved to ExtensionDtype and removed.
1818

19+
# na_value is the default NA value to use for this type. This is used in
20+
# e.g. ExtensionArray.take. This should be the user-facing "boxed" version
21+
# of the NA value, not the physical NA value for storage.
22+
# e.g. for JSONArray, this is an empty dictionary.
23+
na_value = np.nan
24+
1925
def __eq__(self, other):
2026
"""Check whether 'other' is equal to self.
2127
@@ -92,6 +98,8 @@ def is_dtype(cls, dtype):
9298
class ExtensionDtype(_DtypeOpsMixin):
9399
"""A custom data type, to be paired with an ExtensionArray.
94100
101+
.. versionadded:: 0.23.0
102+
95103
Notes
96104
-----
97105
The interface includes the following abstract methods that must
@@ -101,6 +109,9 @@ class ExtensionDtype(_DtypeOpsMixin):
101109
* name
102110
* construct_from_string
103111
112+
The `na_value` class attribute can be used to set the default NA value
113+
for this type. :attr:`numpy.nan` is used by default.
114+
104115
This class does not inherit from 'abc.ABCMeta' for performance reasons.
105116
Methods and properties required by the interface raise
106117
``pandas.errors.AbstractMethodError`` and no ``register`` method is

pandas/core/dtypes/cast.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,6 @@ def changeit():
255255

256256

257257
def maybe_promote(dtype, fill_value=np.nan):
258-
259258
# if we passed an array here, determine the fill value by dtype
260259
if isinstance(fill_value, np.ndarray):
261260
if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)):
@@ -294,6 +293,8 @@ def maybe_promote(dtype, fill_value=np.nan):
294293
elif is_datetimetz(dtype):
295294
if isna(fill_value):
296295
fill_value = iNaT
296+
elif is_extension_array_dtype(dtype) and isna(fill_value):
297+
fill_value = dtype.na_value
297298
elif is_float(fill_value):
298299
if issubclass(dtype.type, np.bool_):
299300
dtype = np.object_

pandas/core/dtypes/missing.py

+2
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,8 @@ def na_value_for_dtype(dtype, compat=True):
502502
"""
503503
dtype = pandas_dtype(dtype)
504504

505+
if is_extension_array_dtype(dtype):
506+
return dtype.na_value
505507
if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or
506508
is_timedelta64_dtype(dtype) or is_period_dtype(dtype)):
507509
return NaT

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3476,7 +3476,7 @@ def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan,
34763476
allow_dups=False)
34773477

34783478
def _reindex_columns(self, new_columns, method, copy, level,
3479-
fill_value=np.nan, limit=None, tolerance=None):
3479+
fill_value=None, limit=None, tolerance=None):
34803480
new_columns, indexer = self.columns.reindex(new_columns, method=method,
34813481
level=level, limit=limit,
34823482
tolerance=tolerance)

pandas/core/generic.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -3660,7 +3660,7 @@ def reindex(self, *args, **kwargs):
36603660
copy = kwargs.pop('copy', True)
36613661
limit = kwargs.pop('limit', None)
36623662
tolerance = kwargs.pop('tolerance', None)
3663-
fill_value = kwargs.pop('fill_value', np.nan)
3663+
fill_value = kwargs.pop('fill_value', None)
36643664

36653665
# Series.reindex doesn't use / need the axis kwarg
36663666
# We pop and ignore it here, to make writing Series/Frame generic code
@@ -3776,7 +3776,7 @@ def _reindex_multi(self, axes, copy, fill_value):
37763776

37773777
@Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
37783778
def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
3779-
limit=None, fill_value=np.nan):
3779+
limit=None, fill_value=None):
37803780
msg = ("'.reindex_axis' is deprecated and will be removed in a future "
37813781
"version. Use '.reindex' instead.")
37823782
self._consolidate_inplace()
@@ -3790,7 +3790,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
37903790
return self._reindex_with_indexers({axis: [new_index, indexer]},
37913791
fill_value=fill_value, copy=copy)
37923792

3793-
def _reindex_with_indexers(self, reindexers, fill_value=np.nan, copy=False,
3793+
def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False,
37943794
allow_dups=False):
37953795
"""allow_dups indicates an internal call here """
37963796

@@ -7252,7 +7252,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True,
72527252
raise TypeError('unsupported type: %s' % type(other))
72537253

72547254
def _align_frame(self, other, join='outer', axis=None, level=None,
7255-
copy=True, fill_value=np.nan, method=None, limit=None,
7255+
copy=True, fill_value=None, method=None, limit=None,
72567256
fill_axis=0):
72577257
# defaults
72587258
join_index, join_columns = None, None

pandas/core/indexing.py

+41
Original file line numberDiff line numberDiff line change
@@ -2417,12 +2417,53 @@ def maybe_convert_indices(indices, n):
24172417
mask = indices < 0
24182418
if mask.any():
24192419
indices[mask] += n
2420+
24202421
mask = (indices >= n) | (indices < 0)
24212422
if mask.any():
24222423
raise IndexError("indices are out-of-bounds")
24232424
return indices
24242425

24252426

2427+
def validate_indices(indices, n):
2428+
"""Perform bounds-checking for an indexer.
2429+
2430+
-1 is allowed for indicating missing values.
2431+
2432+
Parameters
2433+
----------
2434+
indices : ndarray
2435+
n : int
2436+
length of the array being indexed
2437+
2438+
Raises
2439+
------
2440+
ValueError
2441+
2442+
Examples
2443+
--------
2444+
>>> validate_indices([1, 2], 3)
2445+
# OK
2446+
>>> validate_indices([1, -2], 3)
2447+
ValueError
2448+
>>> validate_indices([1, 2, 3], 3)
2449+
IndexError
2450+
>>> validate_indices([-1, -1], 0)
2451+
# OK
2452+
>>> validate_indices([0, 1], 0)
2453+
IndexError
2454+
"""
2455+
if len(indices):
2456+
min_idx = indices.min()
2457+
if min_idx < -1:
2458+
msg = ("'indices' contains values less than allowed ({} < {})"
2459+
.format(min_idx, -1))
2460+
raise ValueError(msg)
2461+
2462+
max_idx = indices.max()
2463+
if max_idx >= n:
2464+
raise IndexError("indices are out-of-bounds")
2465+
2466+
24262467
def maybe_convert_ix(*args):
24272468
"""
24282469
We likely want to take the cross-product

0 commit comments

Comments
 (0)