Skip to content

Commit 9788ad1

Browse files
committed
PERF: optimize index.__getitem__ for slice & boolean mask indexers
1 parent 6c3755b commit 9788ad1

File tree

8 files changed

+83
-31
lines changed

8 files changed

+83
-31
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ API Changes
105105
- ``NameResolutionError`` was removed because it isn't necessary anymore.
106106
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
107107
or numbering columns as needed (:issue:`2385`)
108+
- Slicing and advanced/boolean indexing operations on ``Index`` classes will no
109+
longer change the type of the resulting index (:issue:`6440`).
108110

109111
Experimental Features
110112
~~~~~~~~~~~~~~~~~~~~~

doc/source/v0.14.0.txt

+15
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,21 @@ These are out-of-bounds selections
7878
- ``NameResolutionError`` was removed because it isn't necessary anymore.
7979
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
8080
or numbering columns as needed (:issue:`2385`). See :ref:`the docs <merging.mixed_ndims>`
81+
- Slicing and advanced/boolean indexing operations on ``Index`` classes will no
82+
longer change the type of the resulting index (:issue:`6440`)
83+
84+
.. ipython:: python
85+
86+
i = pd.Index([1, 2, 3, 'a' , 'b', 'c'])
87+
i[[0,1,2]]
88+
89+
Previously, the above operation would return ``Int64Index``. If you'd like
90+
to do this manually, use :meth:`Index.astype`
91+
92+
.. ipython:: python
93+
94+
i[[0,1,2]].astype(np.int_)
95+
8196

8297
MultiIndexing Using Slicers
8398
~~~~~~~~~~~~~~~~~~~~~~~~~~~

pandas/core/index.py

+25-26
Original file line numberDiff line numberDiff line change
@@ -631,34 +631,35 @@ def __hash__(self):
631631
raise TypeError("unhashable type: %r" % type(self).__name__)
632632

633633
def __getitem__(self, key):
634-
"""Override numpy.ndarray's __getitem__ method to work as desired"""
635-
arr_idx = self.view(np.ndarray)
634+
"""
635+
Override numpy.ndarray's __getitem__ method to work as desired.
636+
637+
This function adds lists and Series as valid boolean indexers
638+
(ndarray itself only supports ndarrays with dtype=bool).
639+
640+
If the resulting ndim != 1, a plain ndarray is returned instead of
641+
the corresponding `Index` subclass.
642+
643+
"""
644+
# There's no custom logic to be implemented in __getslice__, so it's
645+
# intentionally not overloaded.
646+
__getitem__ = super(Index, self).__getitem__
636647
if np.isscalar(key):
637-
return arr_idx[key]
638-
else:
639-
if com._is_bool_indexer(key):
640-
key = np.asarray(key)
648+
return __getitem__(key)
641649

642-
try:
643-
result = arr_idx[key]
644-
if result.ndim > 1:
645-
return result
646-
except (IndexError):
647-
if not len(key):
648-
result = []
649-
else:
650-
raise
650+
if isinstance(key, slice):
651+
# This case is separated from the conditional above to avoid
652+
# pessimization of basic indexing.
653+
return __getitem__(key)
651654

652-
return Index(result, name=self.name)
655+
if com._is_bool_indexer(key):
656+
return __getitem__(np.asarray(key))
653657

654-
def _getitem_slice(self, key):
655-
""" getitem for a bool/sliceable, fallback to standard getitem """
656-
try:
657-
arr_idx = self.view(np.ndarray)
658-
result = arr_idx[key]
659-
return self.__class__(result, name=self.name, fastpath=True)
660-
except:
661-
return self.__getitem__(key)
658+
result = __getitem__(key)
659+
if result.ndim > 1:
660+
return result.view(np.ndarray)
661+
else:
662+
return result
662663

663664
def append(self, other):
664665
"""
@@ -2800,8 +2801,6 @@ def __getitem__(self, key):
28002801

28012802
return result
28022803

2803-
_getitem_slice = __getitem__
2804-
28052804
def take(self, indexer, axis=None):
28062805
"""
28072806
Analogous to ndarray.take

pandas/core/internals.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3737,7 +3737,7 @@ def get_slice(self, slobj, raise_on_error=False):
37373737
if raise_on_error:
37383738
_check_slice_bounds(slobj, self.index)
37393739
return self.__class__(self._block._slice(slobj),
3740-
self.index._getitem_slice(slobj), fastpath=True)
3740+
self.index[slobj], fastpath=True)
37413741

37423742
def set_axis(self, axis, value, maybe_rename=True, check_axis=True):
37433743
cur_axis, value = self._set_axis(axis, value, check_axis)

pandas/tests/test_index.py

+27
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,25 @@ def test_fancy(self):
323323
for i in sl:
324324
self.assertEqual(i, sl[sl.get_loc(i)])
325325

326+
def test_empty_fancy(self):
327+
empty_farr = np.array([], dtype=np.float_)
328+
empty_iarr = np.array([], dtype=np.int_)
329+
empty_barr = np.array([], dtype=np.bool_)
330+
331+
# pd.DatetimeIndex is excluded, because it overrides getitem and should
332+
# be tested separately.
333+
for idx in [self.strIndex, self.intIndex, self.floatIndex]:
334+
empty_idx = idx.__class__([])
335+
values = idx.values
336+
337+
self.assert_(idx[[]].identical(empty_idx))
338+
self.assert_(idx[empty_iarr].identical(empty_idx))
339+
self.assert_(idx[empty_barr].identical(empty_idx))
340+
341+
# np.ndarray only accepts ndarrays of int & bool dtypes, and so should
342+
# Index.
343+
self.assertRaises(IndexError, idx.__getitem__, empty_farr)
344+
326345
def test_getitem(self):
327346
arr = np.array(self.dateIndex)
328347
exp = self.dateIndex[5]
@@ -762,6 +781,14 @@ def test_join_self(self):
762781
joined = res.join(res, how=kind)
763782
self.assertIs(res, joined)
764783

784+
def test_indexing_doesnt_change_class(self):
785+
idx = Index([1, 2, 3, 'a', 'b', 'c'])
786+
787+
self.assert_(idx[1:3].identical(
788+
pd.Index([2, 3], dtype=np.object_)))
789+
self.assert_(idx[[0,1]].identical(
790+
pd.Index([1, 2], dtype=np.object_)))
791+
765792

766793
class TestFloat64Index(tm.TestCase):
767794
_multiprocess_can_split_ = True

pandas/tseries/index.py

-2
Original file line numberDiff line numberDiff line change
@@ -1406,8 +1406,6 @@ def __getitem__(self, key):
14061406

14071407
return self._simple_new(result, self.name, new_offset, self.tz)
14081408

1409-
_getitem_slice = __getitem__
1410-
14111409
# Try to run function on index first, and then on elements of index
14121410
# Especially important for group-by functionality
14131411
def map(self, f):

pandas/tseries/period.py

-2
Original file line numberDiff line numberDiff line change
@@ -1056,8 +1056,6 @@ def __getitem__(self, key):
10561056

10571057
return PeriodIndex(result, name=self.name, freq=self.freq)
10581058

1059-
_getitem_slice = __getitem__
1060-
10611059
def _format_with_header(self, header, **kwargs):
10621060
return header + self._format_native_types(**kwargs)
10631061

vb_suite/index_object.py

+13
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,16 @@
4646

4747
index_int64_intersection = Benchmark('left.intersection(right)', setup,
4848
start_date=datetime(2011, 1, 1))
49+
50+
#----------------------------------------------------------------------
51+
# string index slicing
52+
setup = common_setup + """
53+
idx = tm.makeStringIndex(1000000)
54+
55+
mask = np.arange(1000000) % 3 == 0
56+
series_mask = Series(mask)
57+
"""
58+
index_str_slice_indexer_basic = Benchmark('idx[:-1]', setup)
59+
index_str_slice_indexer_even = Benchmark('idx[::2]', setup)
60+
index_str_boolean_indexer = Benchmark('idx[mask]', setup)
61+
index_str_boolean_series_indexer = Benchmark('idx[series_mask]', setup)

0 commit comments

Comments
 (0)