From 0890d070e341df9f5d54c95fca6ec0da168e9fb7 Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Mon, 24 Jun 2013 20:11:35 -0400
Subject: [PATCH] BUG: GH4017, efficiently support non-unique indicies with
 iloc

PERF: getting an indexer with a non_unique index, now MUCH faster

PERF: vbench for loc/iloc with dups

BUG: sparse reindex needed takeable arg

TST

BUG: correctly interpret tuple/list in non_unique indexers

BUG: df.loc[idx] with out-of-bounds indexers not correctly interpreted

PERF: df.loc with non-unique index not blazing fast!
---
 doc/source/release.rst        |  2 ++
 doc/source/v0.12.0.txt        |  2 ++
 pandas/core/frame.py          | 21 +++++++++++++--------
 pandas/core/index.py          | 12 +++++++++---
 pandas/core/indexing.py       | 17 ++++++++++++++---
 pandas/core/series.py         |  8 +++++---
 pandas/index.pyx              | 33 ++++++++++++++++++++++-----------
 pandas/sparse/frame.py        |  5 +++--
 pandas/tests/test_indexing.py | 32 +++++++++++++++++++++++++++++++-
 vb_suite/indexing.py          | 16 ++++++++++++++++
 10 files changed, 117 insertions(+), 31 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 917d91a14441e..817d029417aa1 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -212,6 +212,8 @@ pandas 0.12
     - Extend ``reindex`` to correctly deal with non-unique indices (:issue:`3679`)
     - ``DataFrame.itertuples()`` now works with frames with duplicate column
       names (:issue:`3873`)
+    - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to 
+      ``reindex`` for location-based taking
 
   - Fixed bug in groupby with empty series referencing a variable before assignment. (:issue:`3510`)
   - Allow index name to be used in groupby for non MultiIndex (:issue:`4014`)
diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt
index e146e892722d8..ccdba4d765aff 100644
--- a/doc/source/v0.12.0.txt
+++ b/doc/source/v0.12.0.txt
@@ -410,6 +410,8 @@ Bug Fixes
     - Extend ``reindex`` to correctly deal with non-unique indices (:issue:`3679`)
     - ``DataFrame.itertuples()`` now works with frames with duplicate column
       names (:issue:`3873`)
+    - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to 
+      ``reindex`` for location-based taking
 
   - ``DataFrame.from_records`` did not accept empty recarrays (:issue:`3682`)
   - ``read_html`` now correctly skips tests (:issue:`3741`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 3a23a212c51e8..5f1ea00e421a8 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1979,7 +1979,9 @@ def _ixs(self, i, axis=0, copy=False):
             else:
                 label = self.index[i]
                 if isinstance(label, Index):
-                    return self.reindex(label)
+
+                    # a location index by definition
+                    return self.reindex(label, takeable=True)
                 else:
                     try:
                         new_values = self._data.fast_2d_xs(i, copy=copy)
@@ -2590,7 +2592,7 @@ def _align_series(self, other, join='outer', axis=None, level=None,
             return left_result, right_result
 
     def reindex(self, index=None, columns=None, method=None, level=None,
-                fill_value=NA, limit=None, copy=True):
+                fill_value=NA, limit=None, copy=True, takeable=False):
         """Conform DataFrame to new index with optional filling logic, placing
         NA/NaN in locations having no value in the previous index. A new object
         is produced unless the new index is equivalent to the current one and
@@ -2617,6 +2619,7 @@ def reindex(self, index=None, columns=None, method=None, level=None,
             "compatible" value
         limit : int, default None
             Maximum size gap to forward or backward fill
+        takeable : the labels are locations (and not labels)
 
         Examples
         --------
@@ -2636,11 +2639,11 @@ def reindex(self, index=None, columns=None, method=None, level=None,
 
         if columns is not None:
             frame = frame._reindex_columns(columns, copy, level,
-                                           fill_value, limit)
+                                           fill_value, limit, takeable)
 
         if index is not None:
             frame = frame._reindex_index(index, method, copy, level,
-                                         fill_value, limit)
+                                         fill_value, limit, takeable)
 
         return frame
 
@@ -2717,16 +2720,18 @@ def _reindex_multi(self, new_index, new_columns, copy, fill_value):
             return self.copy() if copy else self
 
     def _reindex_index(self, new_index, method, copy, level, fill_value=NA,
-                       limit=None):
+                       limit=None, takeable=False):
         new_index, indexer = self.index.reindex(new_index, method, level,
-                                                limit=limit, copy_if_needed=True)
+                                                limit=limit, copy_if_needed=True,
+                                                takeable=takeable)
         return self._reindex_with_indexers(new_index, indexer, None, None,
                                            copy, fill_value)
 
     def _reindex_columns(self, new_columns, copy, level, fill_value=NA,
-                         limit=None):
+                         limit=None, takeable=False):
         new_columns, indexer = self.columns.reindex(new_columns, level=level,
-                                                    limit=limit, copy_if_needed=True)
+                                                    limit=limit, copy_if_needed=True,
+                                                    takeable=takeable)
         return self._reindex_with_indexers(None, None, new_columns, indexer,
                                            copy, fill_value)
 
diff --git a/pandas/core/index.py b/pandas/core/index.py
index c06c46cde36c8..05cb360d12e16 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -920,7 +920,8 @@ def _get_method(self, method):
         }
         return aliases.get(method, method)
 
-    def reindex(self, target, method=None, level=None, limit=None, copy_if_needed=False):
+    def reindex(self, target, method=None, level=None, limit=None,
+                copy_if_needed=False, takeable=False):
         """
         For Index, simply returns the new index and the results of
         get_indexer. Provided here to enable an interface that is amenable for
@@ -953,7 +954,11 @@ def reindex(self, target, method=None, level=None, limit=None, copy_if_needed=Fa
                     if method is not None or limit is not None:
                         raise ValueError("cannot reindex a non-unique index "
                                          "with a method or limit")
-                    indexer, missing = self.get_indexer_non_unique(target)
+                    if takeable:
+                        indexer = target
+                        missing = (target>=len(target)).nonzero()[0]
+                    else:
+                        indexer, missing = self.get_indexer_non_unique(target)
 
         return target, indexer
 
@@ -2202,7 +2207,8 @@ def get_indexer(self, target, method=None, limit=None):
 
         return com._ensure_platform_int(indexer)
 
-    def reindex(self, target, method=None, level=None, limit=None, copy_if_needed=False):
+    def reindex(self, target, method=None, level=None, limit=None,
+                copy_if_needed=False, takeable=False):
         """
         Performs any necessary conversion on the input index and calls
         get_indexer. This method is here so MultiIndex and an Index of
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 33f72a0d15415..27c12fcd2e8eb 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -476,10 +476,21 @@ def _reindex(keys, level=None):
                     cur_indexer = com._ensure_int64(l[check])
 
                     new_labels = np.empty(tuple([len(indexer)]),dtype=object)
-                    new_labels[cur_indexer] = cur_labels
-                    new_labels[missing_indexer]    = missing_labels
+                    new_labels[cur_indexer]     = cur_labels
+                    new_labels[missing_indexer] = missing_labels
+                    new_indexer = (Index(cur_indexer) + Index(missing_indexer)).values
+                    new_indexer[missing_indexer] = -1
 
-                    result = result.reindex_axis(new_labels,axis=axis)
+                    # need to reindex with an indexer on a specific axis
+                    from pandas.core.frame import DataFrame
+                    if not (type(self.obj) == DataFrame):
+                        raise NotImplementedError("cannot handle non-unique indexing for non-DataFrame (yet)")
+
+                    args = [None] * 4
+                    args[2*axis] = new_labels
+                    args[2*axis+1] = new_indexer
+
+                    result = result._reindex_with_indexers(*args, copy=False, fill_value=np.nan)
 
                 return result
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
index bd76d91629121..9b11f7c7b0f66 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -598,7 +598,7 @@ def _ixs(self, i, axis=0):
             else:
                 label = self.index[i]
                 if isinstance(label, Index):
-                    return self.reindex(label)
+                    return self.reindex(label, takeable=True)
                 else:
                     return _index.get_value_at(self, i)
 
@@ -2618,7 +2618,7 @@ def _reindex_indexer(self, new_index, indexer, copy):
         return self._constructor(new_values, new_index, name=self.name)
 
     def reindex(self, index=None, method=None, level=None, fill_value=pa.NA,
-                limit=None, copy=True):
+                limit=None, copy=True, takeable=False):
         """Conform Series to new index with optional filling logic, placing
         NA/NaN in locations having no value in the previous index. A new object
         is produced unless the new index is equivalent to the current one and
@@ -2643,6 +2643,7 @@ def reindex(self, index=None, method=None, level=None, fill_value=pa.NA,
             "compatible" value
         limit : int, default None
             Maximum size gap to forward or backward fill
+        takeable : the labels are locations (and not labels)
 
         Returns
         -------
@@ -2664,7 +2665,8 @@ def reindex(self, index=None, method=None, level=None, fill_value=pa.NA,
             return Series(nan, index=index, name=self.name)
 
         new_index, indexer = self.index.reindex(index, method=method,
-                                                level=level, limit=limit)
+                                                level=level, limit=limit,
+                                                takeable=takeable)
         new_values = com.take_1d(self.values, indexer, fill_value=fill_value)
         return Series(new_values, index=new_index, name=self.name)
 
diff --git a/pandas/index.pyx b/pandas/index.pyx
index 85a83b745510f..ac2638b62977c 100644
--- a/pandas/index.pyx
+++ b/pandas/index.pyx
@@ -272,33 +272,44 @@ cdef class IndexEngine:
             to the -1 indicies in the results """
 
         cdef:
-            ndarray values
+            ndarray values, x
             ndarray[int64_t] result, missing
-            object v, val
+            set stargets
+            dict d = {}
+            object val
             int count = 0, count_missing = 0
-            Py_ssize_t i, j, n, found
+            Py_ssize_t i, j, n, n_t
 
         self._ensure_mapping_populated()
         values = self._get_index_values()
+        stargets = set(targets)
         n = len(values)
         n_t = len(targets)
-        result  = np.empty(n+n_t, dtype=np.int64)
+        result  = np.empty(n*n_t, dtype=np.int64)
         missing = np.empty(n_t, dtype=np.int64)
 
+        # form the set of the results (like ismember)
+        members = np.empty(n, dtype=np.uint8)
+        for i in range(n):
+            val = util.get_value_1d(values, i)
+            if val in stargets:
+                if val not in d:
+                   d[val] = []
+                d[val].append(i)
+
         for i in range(n_t):
-            val = util.get_value_at(targets, i)
-            found = 0
 
-            for j in range(n):
-                v = util.get_value_at(values, j)
+            val = util.get_value_1d(targets, i)
 
-                if v == val:
+            # found
+            if val in d:
+                for j in d[val]:
                    result[count] = j
                    count += 1
-                   found = 1
 
             # value not found
-            if found == 0:
+            else:
+
                 result[count] = -1
                 count += 1
                 missing[count_missing] = i
diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py
index 0a08fba49afeb..f5e57efdcb166 100644
--- a/pandas/sparse/frame.py
+++ b/pandas/sparse/frame.py
@@ -583,7 +583,7 @@ def _combine_const(self, other, func):
                                  columns=self.columns)
 
     def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
-                       limit=None):
+                       limit=None, takeable=False):
         if level is not None:
             raise TypeError('Reindex by level not supported for sparse')
 
@@ -614,7 +614,8 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
         return SparseDataFrame(new_series, index=index, columns=self.columns,
                                default_fill_value=self.default_fill_value)
 
-    def _reindex_columns(self, columns, copy, level, fill_value, limit=None):
+    def _reindex_columns(self, columns, copy, level, fill_value, limit=None,
+                         takeable=False):
         if level is not None:
             raise TypeError('Reindex by level not supported for sparse')
 
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index 0719d9c9a87db..8b6bf1ed7f651 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -8,7 +8,7 @@
 import numpy as np
 from numpy.testing import assert_array_equal
 
-import pandas as pan
+import pandas as pd
 import pandas.core.common as com
 from pandas.core.api import (DataFrame, Index, Series, Panel, notnull, isnull,
                              MultiIndex, DatetimeIndex, Timestamp)
@@ -1037,6 +1037,36 @@ def test_loc_name(self):
         result = df.loc[[0, 1]].index.name
         self.assert_(result == 'index_name')
 
+    def test_iloc_non_unique_indexing(self):
+
+        #GH 4017, non-unique indexing (on the axis)
+        df = DataFrame({'A' : [0.1] * 3000, 'B' : [1] * 3000})
+        idx = np.array(range(30)) * 99
+        expected = df.iloc[idx]
+
+        df3 = pd.concat([df, 2*df, 3*df])
+        result = df3.iloc[idx]
+
+        assert_frame_equal(result, expected)
+
+        df2 = DataFrame({'A' : [0.1] * 1000, 'B' : [1] * 1000})
+        df2 = pd.concat([df2, 2*df2, 3*df2])
+
+        sidx = df2.index.to_series()
+        expected = df2.iloc[idx[idx<=sidx.max()]]
+
+        new_list = []
+        for r, s in expected.iterrows():
+            new_list.append(s)
+            new_list.append(s*2)
+            new_list.append(s*3)
+
+        expected = DataFrame(new_list)
+        expected = pd.concat([ expected, DataFrame(index=idx[idx>sidx.max()]) ])
+        result = df2.loc[idx]
+        assert_frame_equal(result, expected)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py
index 7847a880918dc..9f07cc6ed15c3 100644
--- a/vb_suite/indexing.py
+++ b/vb_suite/indexing.py
@@ -148,3 +148,19 @@
 
 indexing_panel_subset = Benchmark('p.ix[inds, inds, inds]', setup,
                                   start_date=datetime(2012, 1, 1))
+
+#----------------------------------------------------------------------
+# Iloc
+
+setup = common_setup + """
+df = DataFrame({'A' : [0.1] * 3000, 'B' : [1] * 3000})
+idx = np.array(range(30)) * 99
+df2 = DataFrame({'A' : [0.1] * 1000, 'B' : [1] * 1000})
+df2 = concat([df2, 2*df2, 3*df2])
+"""
+
+frame_iloc_dups = Benchmark('df2.iloc[idx]', setup,
+                            start_date=datetime(2013, 1, 1))
+
+frame_loc_dups = Benchmark('df2.loc[idx]', setup,
+                            start_date=datetime(2013, 1, 1))