Skip to content

Commit df38f66

Browse files
REF: Internal / External values (#19558)
* REF/Clean: Internal / External values * Move to index base * Cleanup unique handling * Simplify object concat * Use values for intersection I think eventually we'll want to ndarray_values for this, but it'll require a bit more work to support. Currently, using ndarary_values causes occasional failures on categorical. * hmm * Additional testing * More tests * ndarray_values * API: Default ExtensionArray.astype (cherry picked from commit 943a915562b72bed147c857de927afa0daf31c1a) (cherry picked from commit fbf0a06) * Simplify concat_as_object * Py2 compat (cherry picked from commit b20e12c) * Set-ops ugliness * better docstrings * tolist * linting * Moved dtypes (cherry picked from commit d136227) * clean * cleanup * NumPy compat * Use base _values for CategoricalIndex * Update dev docs * cleanup * Linting * Precision in tests * Push _ndarray_values to ExtensionArray Now IndexOpsMixin._ndarray_values will dispatch all the way down to the EA. Subclasses like Categorical can override it as they see fit. * Clean up tolist * Move test locations * Fixed test * REF: Update per comments * lint * REF: Use _values for size and shape * PERF: Implement size, shape for IntervalIndex * PERF: Avoid materializing values for PeriodIndex shape, size * Cleanup * Override nbytes
1 parent d9551c8 commit df38f66

25 files changed

+386
-85
lines changed

doc/source/internals.rst

+19
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,25 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the
8989
constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but
9090
if you compute the levels and labels yourself, please be careful.
9191

92+
Values
93+
~~~~~~
94+
95+
Pandas extends NumPy's type system with custom types, like ``Categorical`` or
96+
datetimes with a timezone, so we have multiple notions of "values". For 1-D
97+
containers (``Index`` classes and ``Series``) we have the following convention:
98+
99+
* ``cls._ndarray_values`` is *always* a NumPy ``ndarray``. Ideally,
100+
``_ndarray_values`` is cheap to compute. For example, for a ``Categorical``,
101+
this returns the codes, not the array of objects.
102+
* ``cls._values`` refers to the "best possible" array. This could be an
103+
``ndarray``, ``ExtensionArray``, or an ``Index`` subclass (note: we're in the
104+
process of removing the index subclasses here so that it's always an
105+
``ndarray`` or ``ExtensionArray``).
106+
107+
So, for example, ``Series[category]._values`` is a ``Categorical``, while
108+
``Series[category]._ndarray_values`` is the underlying codes.
109+
110+
92111
.. _ref-subclassing-pandas:
93112

94113
Subclassing pandas Data Structures

pandas/core/arrays/base.py

+12
Original file line numberDiff line numberDiff line change
@@ -266,3 +266,15 @@ def _can_hold_na(self):
266266
Setting this to false will optimize some operations like fillna.
267267
"""
268268
return True
269+
270+
@property
271+
def _ndarray_values(self):
272+
# type: () -> np.ndarray
273+
"""Internal pandas method for lossy conversion to a NumPy ndarray.
274+
275+
This method is not part of the pandas interface.
276+
277+
The expectation is that this is cheap to compute, and is primarily
278+
used for interacting with our indexers.
279+
"""
280+
return np.array(self)

pandas/core/arrays/categorical.py

+4
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,10 @@ def dtype(self):
410410
"""The :class:`~pandas.api.types.CategoricalDtype` for this instance"""
411411
return self._dtype
412412

413+
@property
414+
def _ndarray_values(self):
415+
return self.codes
416+
413417
@property
414418
def _constructor(self):
415419
return Categorical

pandas/core/base.py

+16-5
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
is_list_like,
1414
is_scalar,
1515
is_datetimelike,
16-
is_extension_type)
16+
is_extension_type,
17+
is_extension_array_dtype)
1718

1819
from pandas.util._validators import validate_bool_kwarg
1920

@@ -738,7 +739,7 @@ def data(self):
738739
@property
739740
def itemsize(self):
740741
""" return the size of the dtype of the item of the underlying data """
741-
return self._values.itemsize
742+
return self._ndarray_values.itemsize
742743

743744
@property
744745
def nbytes(self):
@@ -748,7 +749,7 @@ def nbytes(self):
748749
@property
749750
def strides(self):
750751
""" return the strides of the underlying data """
751-
return self._values.strides
752+
return self._ndarray_values.strides
752753

753754
@property
754755
def size(self):
@@ -768,8 +769,17 @@ def base(self):
768769
return self.values.base
769770

770771
@property
771-
def _values(self):
772-
""" the internal implementation """
772+
def _ndarray_values(self):
773+
"""The data as an ndarray, possibly losing information.
774+
775+
The expectation is that this is cheap to compute, and is primarily
776+
used for interacting with our indexers.
777+
778+
- categorical -> codes
779+
"""
780+
# type: () -> np.ndarray
781+
if is_extension_array_dtype(self):
782+
return self.values._ndarray_values
773783
return self.values
774784

775785
@property
@@ -979,6 +989,7 @@ def unique(self):
979989
values = self._values
980990

981991
if hasattr(values, 'unique'):
992+
982993
result = values.unique()
983994
else:
984995
from pandas.core.algorithms import unique1d

pandas/core/dtypes/cast.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -927,7 +927,7 @@ def try_timedelta(v):
927927
# will try first with a string & object conversion
928928
from pandas import to_timedelta
929929
try:
930-
return to_timedelta(v)._values.reshape(shape)
930+
return to_timedelta(v)._ndarray_values.reshape(shape)
931931
except Exception:
932932
return v.reshape(shape)
933933

pandas/core/dtypes/common.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1709,7 +1709,7 @@ def is_extension_array_dtype(arr_or_dtype):
17091709
from pandas.core.arrays import ExtensionArray
17101710

17111711
# we want to unpack series, anything else?
1712-
if isinstance(arr_or_dtype, ABCSeries):
1712+
if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)):
17131713
arr_or_dtype = arr_or_dtype._values
17141714
return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray))
17151715

pandas/core/dtypes/concat.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -488,12 +488,14 @@ def _concat_index_asobject(to_concat, name=None):
488488
concat all inputs as object. DatetimeIndex, TimedeltaIndex and
489489
PeriodIndex are converted to object dtype before concatenation
490490
"""
491+
from pandas import Index
492+
from pandas.core.arrays import ExtensionArray
491493

492-
klasses = ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex
494+
klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex,
495+
ExtensionArray)
493496
to_concat = [x.astype(object) if isinstance(x, klasses) else x
494497
for x in to_concat]
495498

496-
from pandas import Index
497499
self = to_concat[0]
498500
attribs = self._get_attributes_dict()
499501
attribs['name'] = name

pandas/core/indexes/base.py

+83-25
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,14 @@
3131
is_object_dtype,
3232
is_categorical_dtype,
3333
is_interval_dtype,
34+
is_period_dtype,
3435
is_bool,
3536
is_bool_dtype,
3637
is_signed_integer_dtype,
3738
is_unsigned_integer_dtype,
3839
is_integer_dtype, is_float_dtype,
3940
is_datetime64_any_dtype,
41+
is_datetime64tz_dtype,
4042
is_timedelta64_dtype,
4143
needs_i8_conversion,
4244
is_iterator, is_list_like,
@@ -412,7 +414,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
412414
values = np.array(values, copy=False)
413415
if is_object_dtype(values):
414416
values = cls(values, name=name, dtype=dtype,
415-
**kwargs)._values
417+
**kwargs)._ndarray_values
416418

417419
result = object.__new__(cls)
418420
result._data = values
@@ -594,6 +596,40 @@ def values(self):
594596
""" return the underlying data as an ndarray """
595597
return self._data.view(np.ndarray)
596598

599+
@property
600+
def _values(self):
601+
# type: () -> Union[ExtensionArray, Index]
602+
# TODO(EA): remove index types as they become extension arrays
603+
"""The best array representation.
604+
605+
This is an ndarray, ExtensionArray, or Index subclass. This differs
606+
from ``_ndarray_values``, which always returns an ndarray.
607+
608+
Both ``_values`` and ``_ndarray_values`` are consistent between
609+
``Series`` and ``Index``.
610+
611+
It may differ from the public '.values' method.
612+
613+
index | values | _values | _ndarray_values |
614+
----------------- | --------------- | ----------- | --------------- |
615+
CategoricalIndex | Categorical | Categorical | codes |
616+
DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] |
617+
618+
For the following, the ``._values`` is currently ``ndarray[object]``,
619+
but will soon be an ``ExtensionArray``
620+
621+
index | values | _values | _ndarray_values |
622+
----------------- | --------------- | ------------ | --------------- |
623+
PeriodIndex | ndarray[object] | ndarray[obj] | ndarray[int] |
624+
IntervalIndex | ndarray[object] | ndarray[obj] | ndarray[object] |
625+
626+
See Also
627+
--------
628+
values
629+
_ndarray_values
630+
"""
631+
return self.values
632+
597633
def get_values(self):
598634
""" return the underlying data as an ndarray """
599635
return self.values
@@ -664,7 +700,7 @@ def ravel(self, order='C'):
664700
--------
665701
numpy.ndarray.ravel
666702
"""
667-
return self._values.ravel(order=order)
703+
return self._ndarray_values.ravel(order=order)
668704

669705
# construction helpers
670706
@classmethod
@@ -1597,7 +1633,7 @@ def _constructor(self):
15971633
@cache_readonly
15981634
def _engine(self):
15991635
# property, for now, slow to look up
1600-
return self._engine_type(lambda: self._values, len(self))
1636+
return self._engine_type(lambda: self._ndarray_values, len(self))
16011637

16021638
def _validate_index_level(self, level):
16031639
"""
@@ -2228,27 +2264,37 @@ def union(self, other):
22282264
other = other.astype('O')
22292265
return this.union(other)
22302266

2267+
# TODO(EA): setops-refactor, clean all this up
2268+
if is_period_dtype(self) or is_datetime64tz_dtype(self):
2269+
lvals = self._ndarray_values
2270+
else:
2271+
lvals = self._values
2272+
if is_period_dtype(other) or is_datetime64tz_dtype(other):
2273+
rvals = other._ndarray_values
2274+
else:
2275+
rvals = other._values
2276+
22312277
if self.is_monotonic and other.is_monotonic:
22322278
try:
2233-
result = self._outer_indexer(self._values, other._values)[0]
2279+
result = self._outer_indexer(lvals, rvals)[0]
22342280
except TypeError:
22352281
# incomparable objects
2236-
result = list(self._values)
2282+
result = list(lvals)
22372283

22382284
# worth making this faster? a very unusual case
2239-
value_set = set(self._values)
2240-
result.extend([x for x in other._values if x not in value_set])
2285+
value_set = set(lvals)
2286+
result.extend([x for x in rvals if x not in value_set])
22412287
else:
22422288
indexer = self.get_indexer(other)
22432289
indexer, = (indexer == -1).nonzero()
22442290

22452291
if len(indexer) > 0:
2246-
other_diff = algos.take_nd(other._values, indexer,
2292+
other_diff = algos.take_nd(rvals, indexer,
22472293
allow_fill=False)
2248-
result = _concat._concat_compat((self._values, other_diff))
2294+
result = _concat._concat_compat((lvals, other_diff))
22492295

22502296
try:
2251-
self._values[0] < other_diff[0]
2297+
lvals[0] < other_diff[0]
22522298
except TypeError as e:
22532299
warnings.warn("%s, sort order is undefined for "
22542300
"incomparable objects" % e, RuntimeWarning,
@@ -2260,7 +2306,7 @@ def union(self, other):
22602306
result.sort()
22612307

22622308
else:
2263-
result = self._values
2309+
result = lvals
22642310

22652311
try:
22662312
result = np.sort(result)
@@ -2311,20 +2357,30 @@ def intersection(self, other):
23112357
other = other.astype('O')
23122358
return this.intersection(other)
23132359

2360+
# TODO(EA): setops-refactor, clean all this up
2361+
if is_period_dtype(self):
2362+
lvals = self._ndarray_values
2363+
else:
2364+
lvals = self._values
2365+
if is_period_dtype(other):
2366+
rvals = other._ndarray_values
2367+
else:
2368+
rvals = other._values
2369+
23142370
if self.is_monotonic and other.is_monotonic:
23152371
try:
2316-
result = self._inner_indexer(self._values, other._values)[0]
2372+
result = self._inner_indexer(lvals, rvals)[0]
23172373
return self._wrap_union_result(other, result)
23182374
except TypeError:
23192375
pass
23202376

23212377
try:
2322-
indexer = Index(other._values).get_indexer(self._values)
2378+
indexer = Index(rvals).get_indexer(lvals)
23232379
indexer = indexer.take((indexer != -1).nonzero()[0])
23242380
except Exception:
23252381
# duplicates
23262382
indexer = algos.unique1d(
2327-
Index(other._values).get_indexer_non_unique(self._values)[0])
2383+
Index(rvals).get_indexer_non_unique(lvals)[0])
23282384
indexer = indexer[indexer != -1]
23292385

23302386
taken = other.take(indexer)
@@ -2700,7 +2756,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
27002756
raise ValueError('limit argument only valid if doing pad, '
27012757
'backfill or nearest reindexing')
27022758

2703-
indexer = self._engine.get_indexer(target._values)
2759+
indexer = self._engine.get_indexer(target._ndarray_values)
27042760

27052761
return _ensure_platform_int(indexer)
27062762

@@ -2716,12 +2772,13 @@ def _get_fill_indexer(self, target, method, limit=None, tolerance=None):
27162772
if self.is_monotonic_increasing and target.is_monotonic_increasing:
27172773
method = (self._engine.get_pad_indexer if method == 'pad' else
27182774
self._engine.get_backfill_indexer)
2719-
indexer = method(target._values, limit)
2775+
indexer = method(target._ndarray_values, limit)
27202776
else:
27212777
indexer = self._get_fill_indexer_searchsorted(target, method,
27222778
limit)
27232779
if tolerance is not None:
2724-
indexer = self._filter_indexer_tolerance(target._values, indexer,
2780+
indexer = self._filter_indexer_tolerance(target._ndarray_values,
2781+
indexer,
27252782
tolerance)
27262783
return indexer
27272784

@@ -2812,7 +2869,7 @@ def get_indexer_non_unique(self, target):
28122869
self = Index(self.asi8)
28132870
tgt_values = target.asi8
28142871
else:
2815-
tgt_values = target._values
2872+
tgt_values = target._ndarray_values
28162873

28172874
indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
28182875
return _ensure_platform_int(indexer), missing
@@ -3247,16 +3304,17 @@ def _join_multi(self, other, how, return_indexers=True):
32473304
def _join_non_unique(self, other, how='left', return_indexers=False):
32483305
from pandas.core.reshape.merge import _get_join_indexers
32493306

3250-
left_idx, right_idx = _get_join_indexers([self._values],
3251-
[other._values], how=how,
3307+
left_idx, right_idx = _get_join_indexers([self._ndarray_values],
3308+
[other._ndarray_values],
3309+
how=how,
32523310
sort=True)
32533311

32543312
left_idx = _ensure_platform_int(left_idx)
32553313
right_idx = _ensure_platform_int(right_idx)
32563314

3257-
join_index = np.asarray(self._values.take(left_idx))
3315+
join_index = np.asarray(self._ndarray_values.take(left_idx))
32583316
mask = left_idx == -1
3259-
np.putmask(join_index, mask, other._values.take(right_idx))
3317+
np.putmask(join_index, mask, other._ndarray_values.take(right_idx))
32603318

32613319
join_index = self._wrap_joined_index(join_index, other)
32623320

@@ -3403,8 +3461,8 @@ def _join_monotonic(self, other, how='left', return_indexers=False):
34033461
else:
34043462
return ret_index
34053463

3406-
sv = self._values
3407-
ov = other._values
3464+
sv = self._ndarray_values
3465+
ov = other._ndarray_values
34083466

34093467
if self.is_unique and other.is_unique:
34103468
# We can perform much better than the general case
@@ -3756,7 +3814,7 @@ def insert(self, loc, item):
37563814
item = self._na_value
37573815

37583816
_self = np.asarray(self)
3759-
item = self._coerce_scalar_to_index(item)._values
3817+
item = self._coerce_scalar_to_index(item)._ndarray_values
37603818
idx = np.concatenate((_self[:loc], item, _self[loc:]))
37613819
return self._shallow_copy_with_infer(idx)
37623820

0 commit comments

Comments
 (0)