pandas-dev · jreback · Jul 18, 2013 · Jul 18, 2013
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -235,7 +235,8 @@ pandas 0.12
       names (:issue:`3873`)
     - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
       ``reindex`` for location-based taking
-    - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem`` (:issue:`4246)
+    - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`)
+    - Fixed excessive memory allocation when indexing a non-unique index via ``.ix/.loc`` (:issue:`4280`)
 
   - Fixed bug in groupby with empty series referencing a variable before assignment. (:issue:`3510`)
   - Allow index name to be used in groupby for non MultiIndex (:issue:`4014`)

diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt
@@ -437,7 +437,8 @@ Bug Fixes
       names (:issue:`3873`)
     - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
       ``reindex`` for location-based taking
-    - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem`` (:issue:`4246)
+    - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`)
+    - Fixed excessive memory allocation when indexing a non-unique index via ``.ix/.loc`` (:issue:`4280`)
 
   - ``DataFrame.from_records`` did not accept empty recarrays (:issue:`3682`)
   - ``read_html`` now correctly skips tests (:issue:`3741`)

diff --git a/pandas/index.pyx b/pandas/index.pyx
@@ -278,14 +278,19 @@ cdef class IndexEngine:
             dict d = {}
             object val
             int count = 0, count_missing = 0
-            Py_ssize_t i, j, n, n_t
+            Py_ssize_t i, j, n, n_t, n_alloc
 
         self._ensure_mapping_populated()
         values = self._get_index_values()
         stargets = set(targets)
         n = len(values)
         n_t = len(targets)
-        result  = np.empty(n*n_t, dtype=np.int64)
+        if n > 10000:
+            n_alloc = 10000
+        else:
+            n_alloc = n
+
+        result  = np.empty(n_alloc, dtype=np.int64)
         missing = np.empty(n_t, dtype=np.int64)
 
         # form the set of the results (like ismember)
@@ -304,12 +309,21 @@ cdef class IndexEngine:
             # found
             if val in d:
                 for j in d[val]:
+
+                   # realloc if needed
+                   if count >= n_alloc:
+                      n_alloc += 10000
+                      result = np.resize(result, n_alloc)
+
                    result[count] = j
                    count += 1
 
             # value not found
             else:
 
+                if count >= n_alloc:
+                     n_alloc += 10000
+                     result = np.resize(result, n_alloc)
                 result[count] = -1
                 count += 1
                 missing[count_missing] = i

diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
@@ -1102,6 +1102,40 @@ def test_mi_access(self):
         result = df2['A']['B2']
         assert_frame_equal(result,expected)
 
+    def test_non_unique_loc_memory_error(self):
+
+        # GH 4280
+        # selecting many labels via ``.loc`` on a non-unique index used to trigger a memory error
+        # test frames: l rows from randn labelled 0..l-1, followed by l2 all-ones rows labelled 0
+        columns = list('ABCDEFG')
+        def gen_test(l,l2):
+            return pd.concat([ DataFrame(randn(l,len(columns)),index=range(l),columns=columns),
+                               DataFrame(np.ones((l2,len(columns))),index=[0]*l2,columns=columns) ])
+
+        # expected: positional row 0, then len(mask) all-ones rows labelled 0, then positional rows mask[1:]
+        def gen_expected(df,mask):
+            l = len(mask)  # equals the frame's l2 only because both calls below use len(mask) == l2
+            return pd.concat([
+                df.take([0],convert=False),
+                DataFrame(np.ones((l,len(columns))),index=[0]*l,columns=columns),
+                df.take(mask[1:],convert=False) ])
+
+        df = gen_test(900,100)  # small case: 1000 rows in total, 100 labels selected below
+        self.assert_(not df.index.is_unique)
+
+        mask = np.arange(100)
+        result = df.loc[mask]
+        expected = gen_expected(df,mask)
+        assert_frame_equal(result,expected)
+
+        df = gen_test(900000,100000)  # large case: 1,000,000 rows in total, 100,000 labels selected below
+        self.assert_(not df.index.is_unique)
+
+        mask = np.arange(100000)
+        result = df.loc[mask]
+        expected = gen_expected(df,mask)
+        assert_frame_equal(result,expected)
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],