PERF: optimize index.__getitem__ for slice & boolean mask indexers

immerrr · immerrr · commit 9788ad19db03 · 2014-02-28T18:03:39.000+04:00
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -105,6 +105,8 @@ API Changes
 - ``NameResolutionError`` was removed because it isn't necessary anymore.
 - ``concat`` will now concatenate mixed Series and DataFrames using the Series name
   or numbering columns as needed (:issue:`2385`)
+- Slicing and advanced/boolean indexing operations on ``Index`` classes will no
+  longer change the type of the resulting index (:issue:`6440`).
 
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -78,6 +78,21 @@ These are out-of-bounds selections
 - ``NameResolutionError`` was removed because it isn't necessary anymore.
 - ``concat`` will now concatenate mixed Series and DataFrames using the Series name
   or numbering columns as needed (:issue:`2385`). See :ref:`the docs <merging.mixed_ndims>`
+- Slicing and advanced/boolean indexing operations on ``Index`` classes will no
+  longer change the type of the resulting index (:issue:`6440`)
+
+  .. ipython:: python
+
+     i = pd.Index([1, 2, 3, 'a' , 'b', 'c'])
+     i[[0,1,2]]
+
+  Previously, the above operation would return ``Int64Index``.  If you'd like
+  to get the old result, convert explicitly with :meth:`Index.astype`:
+
+  .. ipython:: python
+
+     i[[0,1,2]].astype(np.int_)
+
 
 MultiIndexing Using Slicers
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -631,34 +631,35 @@ def __hash__(self):
         raise TypeError("unhashable type: %r" % type(self).__name__)
 
     def __getitem__(self, key):
-        """Override numpy.ndarray's __getitem__ method to work as desired"""
-        arr_idx = self.view(np.ndarray)
+        """
+        Override numpy.ndarray's __getitem__ method to work as desired.
+
+        This function adds lists and Series as valid boolean indexers
+        (ndarrays only support ndarray with dtype=bool).
+
+        If resulting ndim != 1, plain ndarray is returned instead of
+        corresponding `Index` subclass.
+
+        """
+        # There's no custom logic to be implemented in __getslice__, so it's
+        # intentionally not overridden.
+        __getitem__ = super(Index, self).__getitem__
         if np.isscalar(key):
-            return arr_idx[key]
-        else:
-            if com._is_bool_indexer(key):
-                key = np.asarray(key)
+            return __getitem__(key)
 
-            try:
-                result = arr_idx[key]
-                if result.ndim > 1:
-                    return result
-            except (IndexError):
-                if not len(key):
-                    result = []
-                else:
-                    raise
+        if isinstance(key, slice):
+            # This case is separated from the conditional above to avoid
+            # pessimization of basic indexing.
+            return __getitem__(key)
 
-            return Index(result, name=self.name)
+        if com._is_bool_indexer(key):
+            return __getitem__(np.asarray(key))
 
-    def _getitem_slice(self, key):
-        """ getitem for a bool/sliceable, fallback to standard getitem """
-        try:
-            arr_idx = self.view(np.ndarray)
-            result = arr_idx[key]
-            return self.__class__(result, name=self.name, fastpath=True)
-        except:
-            return self.__getitem__(key)
+        result = __getitem__(key)
+        if result.ndim > 1:
+            return result.view(np.ndarray)
+        else:
+            return result
 
     def append(self, other):
         """
@@ -2800,8 +2801,6 @@ def __getitem__(self, key):
 
             return result
 
-    _getitem_slice = __getitem__
-
     def take(self, indexer, axis=None):
         """
         Analogous to ndarray.take
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -3737,7 +3737,7 @@ def get_slice(self, slobj, raise_on_error=False):
         if raise_on_error:
             _check_slice_bounds(slobj, self.index)
         return self.__class__(self._block._slice(slobj),
-                              self.index._getitem_slice(slobj), fastpath=True)
+                              self.index[slobj], fastpath=True)
 
     def set_axis(self, axis, value, maybe_rename=True, check_axis=True):
         cur_axis, value = self._set_axis(axis, value, check_axis)
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -323,6 +323,25 @@ def test_fancy(self):
         for i in sl:
             self.assertEqual(i, sl[sl.get_loc(i)])
 
+    def test_empty_fancy(self):
+        empty_farr = np.array([], dtype=np.float_)
+        empty_iarr = np.array([], dtype=np.int_)
+        empty_barr = np.array([], dtype=np.bool_)
+
+        # pd.DatetimeIndex is excluded, because it overrides getitem and should
+        # be tested separately.
+        for idx in [self.strIndex, self.intIndex, self.floatIndex]:
+            empty_idx = idx.__class__([])
+            values = idx.values
+
+            self.assert_(idx[[]].identical(empty_idx))
+            self.assert_(idx[empty_iarr].identical(empty_idx))
+            self.assert_(idx[empty_barr].identical(empty_idx))
+
+            # np.ndarray only accepts int & bool ndarrays as indexers, and so
+            # should Index.
+            self.assertRaises(IndexError, idx.__getitem__, empty_farr)
+
     def test_getitem(self):
         arr = np.array(self.dateIndex)
         exp = self.dateIndex[5]
@@ -762,6 +781,14 @@ def test_join_self(self):
                 joined = res.join(res, how=kind)
                 self.assertIs(res, joined)
 
+    def test_indexing_doesnt_change_class(self):
+        idx = Index([1, 2, 3, 'a', 'b', 'c'])
+
+        self.assert_(idx[1:3].identical(
+            pd.Index([2, 3], dtype=np.object_)))
+        self.assert_(idx[[0,1]].identical(
+            pd.Index([1, 2], dtype=np.object_)))
+
 
 class TestFloat64Index(tm.TestCase):
     _multiprocess_can_split_ = True
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
@@ -1406,8 +1406,6 @@ def __getitem__(self, key):
 
             return self._simple_new(result, self.name, new_offset, self.tz)
 
-    _getitem_slice = __getitem__
-
     # Try to run function on index first, and then on elements of index
     # Especially important for group-by functionality
     def map(self, f):
diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py
@@ -1056,8 +1056,6 @@ def __getitem__(self, key):
 
             return PeriodIndex(result, name=self.name, freq=self.freq)
 
-    _getitem_slice = __getitem__
-
     def _format_with_header(self, header, **kwargs):
         return header + self._format_native_types(**kwargs)
 
diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py
@@ -46,3 +46,16 @@
 
 index_int64_intersection = Benchmark('left.intersection(right)', setup,
                                      start_date=datetime(2011, 1, 1))
+
+#----------------------------------------------------------------------
+# string index slicing and boolean-mask indexing
+setup = common_setup + """
+idx = tm.makeStringIndex(1000000)
+
+mask = np.arange(1000000) % 3 == 0
+series_mask = Series(mask)
+"""
+index_str_slice_indexer_basic = Benchmark('idx[:-1]', setup)
+index_str_slice_indexer_even = Benchmark('idx[::2]', setup)
+index_str_boolean_indexer = Benchmark('idx[mask]', setup)
+index_str_boolean_series_indexer = Benchmark('idx[series_mask]', setup)