diff --git a/doc/source/release.rst b/doc/source/release.rst index f3029cfe41349..b5871643f4c1d 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -235,7 +235,8 @@ pandas 0.12 names (:issue:`3873`) - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to ``reindex`` for location-based taking - - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem`` (:issue:`4246) + - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`) + - Fixed non-unique indexing memory allocation issue with ``.ix/.loc`` (:issue:`4280`) - Fixed bug in groupby with empty series referencing a variable before assignment. (:issue:`3510`) - Allow index name to be used in groupby for non MultiIndex (:issue:`4014`) diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt index 76fd81b882e84..c956d9ff3458f 100644 --- a/doc/source/v0.12.0.txt +++ b/doc/source/v0.12.0.txt @@ -437,7 +437,8 @@ Bug Fixes names (:issue:`3873`) - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to ``reindex`` for location-based taking - - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem`` (:issue:`4246) + - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`) + - Fixed non-unique indexing memory allocation issue with ``.ix/.loc`` (:issue:`4280`) - ``DataFrame.from_records`` did not accept empty recarrays (:issue:`3682`) - ``read_html`` now correctly skips tests (:issue:`3741`) diff --git a/pandas/index.pyx b/pandas/index.pyx index ac2638b62977c..2311ac25293f1 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -278,14 +278,19 @@ cdef class IndexEngine: dict d = {} object val int count = 0, count_missing = 0 - Py_ssize_t i, j, n, n_t + Py_ssize_t i, j, n, n_t, n_alloc self._ensure_mapping_populated() values = self._get_index_values() stargets = set(targets) n = len(values) n_t = len(targets) - result = np.empty(n*n_t, dtype=np.int64) + if n > 10000: + n_alloc = 10000 + else: + n_alloc = n + + result = np.empty(n_alloc, dtype=np.int64) missing = np.empty(n_t, dtype=np.int64) # form the set of the results (like ismember) @@ -304,12 +309,21 @@ cdef class IndexEngine: # found if val in d: for j in d[val]: + + # realloc if needed + if count >= n_alloc: + n_alloc += 10000 + result = np.resize(result, n_alloc) + result[count] = j count += 1 # value not found else: + if count >= n_alloc: + n_alloc += 10000 + result = np.resize(result, n_alloc) result[count] = -1 count += 1 missing[count_missing] = i diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index a4174c236c26a..f0ace52f2c2b5 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -1102,6 +1102,40 @@ def test_mi_access(self): result = df2['A']['B2'] assert_frame_equal(result,expected) + def test_non_unique_loc_memory_error(self): + + # GH 4280 + # non_unique index with a large selection triggers a memory error + + columns = list('ABCDEFG') + def gen_test(l,l2): + return pd.concat([ DataFrame(randn(l,len(columns)),index=range(l),columns=columns), + DataFrame(np.ones((l2,len(columns))),index=[0]*l2,columns=columns) ]) + + + def gen_expected(df,mask): + l = len(mask) + return pd.concat([ + df.take([0],convert=False), + DataFrame(np.ones((l,len(columns))),index=[0]*l,columns=columns), + df.take(mask[1:],convert=False) ]) + + df = gen_test(900,100) + self.assert_(not df.index.is_unique) + + mask = np.arange(100) + result = df.loc[mask] + expected = gen_expected(df,mask) + assert_frame_equal(result,expected) + + df = gen_test(900000,100000) + self.assert_(not df.index.is_unique) + + mask = np.arange(100000) + result = df.loc[mask] + expected = gen_expected(df,mask) + assert_frame_equal(result,expected) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],