Skip to content

PERF: optimize index.__getitem__ for slice & boolean mask indexers #6440

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 28, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ API Changes
- ``NameResolutionError`` was removed because it isn't necessary anymore.
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
or numbering columns as needed (:issue:`2385`)
- Slicing and advanced/boolean indexing operations on ``Index`` classes will no
longer change the type of the resulting index (:issue:`6440`).

Experimental Features
~~~~~~~~~~~~~~~~~~~~~
Expand Down
15 changes: 15 additions & 0 deletions doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,21 @@ These are out-of-bounds selections
- ``NameResolutionError`` was removed because it isn't necessary anymore.
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
or numbering columns as needed (:issue:`2385`). See :ref:`the docs <merging.mixed_ndims>`
- Slicing and advanced/boolean indexing operations on ``Index`` classes will no
longer change the type of the resulting index (:issue:`6440`)

.. ipython:: python

i = pd.Index([1, 2, 3, 'a' , 'b', 'c'])
i[[0,1,2]]

Previously, the above operation would return ``Int64Index``. If you'd like
to get that result, convert explicitly with :meth:`Index.astype`:

.. ipython:: python

i[[0,1,2]].astype(np.int_)


MultiIndexing Using Slicers
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
51 changes: 25 additions & 26 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -631,34 +631,35 @@ def __hash__(self):
raise TypeError("unhashable type: %r" % type(self).__name__)

def __getitem__(self, key):
"""Override numpy.ndarray's __getitem__ method to work as desired"""
arr_idx = self.view(np.ndarray)
"""
Override numpy.ndarray's __getitem__ method to work as desired.

This function adds lists and Series as valid boolean indexers
(a plain ndarray only supports an ndarray with dtype=bool).

If resulting ndim != 1, plain ndarray is returned instead of
corresponding `Index` subclass.

"""
# There's no custom logic to be implemented in __getslice__, so it is
# intentionally not overridden.
__getitem__ = super(Index, self).__getitem__
if np.isscalar(key):
return arr_idx[key]
else:
if com._is_bool_indexer(key):
key = np.asarray(key)
return __getitem__(key)

try:
result = arr_idx[key]
if result.ndim > 1:
return result
except (IndexError):
if not len(key):
result = []
else:
raise
if isinstance(key, slice):
# This case is separated from the conditional above to avoid
# pessimization of basic indexing.
return __getitem__(key)

return Index(result, name=self.name)
if com._is_bool_indexer(key):
return __getitem__(np.asarray(key))

def _getitem_slice(self, key):
""" getitem for a bool/sliceable, fallback to standard getitem """
try:
arr_idx = self.view(np.ndarray)
result = arr_idx[key]
return self.__class__(result, name=self.name, fastpath=True)
except:
return self.__getitem__(key)
result = __getitem__(key)
if result.ndim > 1:
return result.view(np.ndarray)
else:
return result

def append(self, other):
"""
Expand Down Expand Up @@ -2800,8 +2801,6 @@ def __getitem__(self, key):

return result

_getitem_slice = __getitem__

def take(self, indexer, axis=None):
"""
Analogous to ndarray.take
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -3737,7 +3737,7 @@ def get_slice(self, slobj, raise_on_error=False):
if raise_on_error:
_check_slice_bounds(slobj, self.index)
return self.__class__(self._block._slice(slobj),
self.index._getitem_slice(slobj), fastpath=True)
self.index[slobj], fastpath=True)

def set_axis(self, axis, value, maybe_rename=True, check_axis=True):
cur_axis, value = self._set_axis(axis, value, check_axis)
Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,25 @@ def test_fancy(self):
for i in sl:
self.assertEqual(i, sl[sl.get_loc(i)])

def test_empty_fancy(self):
empty_farr = np.array([], dtype=np.float_)
empty_iarr = np.array([], dtype=np.int_)
empty_barr = np.array([], dtype=np.bool_)

# pd.DatetimeIndex is excluded, because it overrides getitem and should
# be tested separately.
for idx in [self.strIndex, self.intIndex, self.floatIndex]:
empty_idx = idx.__class__([])
values = idx.values

self.assert_(idx[[]].identical(empty_idx))
self.assert_(idx[empty_iarr].identical(empty_idx))
self.assert_(idx[empty_barr].identical(empty_idx))

# np.ndarray only accepts ndarrays of int & bool dtypes as indexers,
# and so should Index.
self.assertRaises(IndexError, idx.__getitem__, empty_farr)

def test_getitem(self):
arr = np.array(self.dateIndex)
exp = self.dateIndex[5]
Expand Down Expand Up @@ -762,6 +781,14 @@ def test_join_self(self):
joined = res.join(res, how=kind)
self.assertIs(res, joined)

def test_indexing_doesnt_change_class(self):
idx = Index([1, 2, 3, 'a', 'b', 'c'])

self.assert_(idx[1:3].identical(
pd.Index([2, 3], dtype=np.object_)))
self.assert_(idx[[0,1]].identical(
pd.Index([1, 2], dtype=np.object_)))


class TestFloat64Index(tm.TestCase):
_multiprocess_can_split_ = True
Expand Down
2 changes: 0 additions & 2 deletions pandas/tseries/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1406,8 +1406,6 @@ def __getitem__(self, key):

return self._simple_new(result, self.name, new_offset, self.tz)

_getitem_slice = __getitem__

# Try to run function on index first, and then on elements of index
# Especially important for group-by functionality
def map(self, f):
Expand Down
2 changes: 0 additions & 2 deletions pandas/tseries/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -1056,8 +1056,6 @@ def __getitem__(self, key):

return PeriodIndex(result, name=self.name, freq=self.freq)

_getitem_slice = __getitem__

def _format_with_header(self, header, **kwargs):
return header + self._format_native_types(**kwargs)

Expand Down
13 changes: 13 additions & 0 deletions vb_suite/index_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,16 @@

index_int64_intersection = Benchmark('left.intersection(right)', setup,
start_date=datetime(2011, 1, 1))

#----------------------------------------------------------------------
# string index slicing
setup = common_setup + """
idx = tm.makeStringIndex(1000000)

mask = np.arange(1000000) % 3 == 0
series_mask = Series(mask)
"""
index_str_slice_indexer_basic = Benchmark('idx[:-1]', setup)
index_str_slice_indexer_even = Benchmark('idx[::2]', setup)
index_str_boolean_indexer = Benchmark('idx[mask]', setup)
index_str_boolean_series_indexer = Benchmark('idx[series_mask]', setup)