Skip to content

Commit 673931d

Browse files
committed
Merge pull request #4283 from jreback/index_mem
BUG: Fixed non-unique indexing memory allocation issue with .ix/.loc (GH4280)
2 parents f445088 + 1b91f4f commit 673931d

File tree

4 files changed

+54
-4
lines changed

4 files changed

+54
-4
lines changed

doc/source/release.rst

+2 -1
Original file line number | Diff line number | Diff line change
@@ -235,7 +235,8 @@ pandas 0.12
235235
names (:issue:`3873`)
236236
- Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
237237
``reindex`` for location-based taking
238-
- Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem`` (:issue:`4246)
238+
- Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`)
239+
- Fixed non-unique indexing memory allocation issue with ``.ix/.loc`` (:issue:`4280`)
239240

240241
- Fixed bug in groupby with empty series referencing a variable before assignment. (:issue:`3510`)
241242
- Allow index name to be used in groupby for non MultiIndex (:issue:`4014`)

doc/source/v0.12.0.txt

+2 -1
Original file line number | Diff line number | Diff line change
@@ -437,7 +437,8 @@ Bug Fixes
437437
names (:issue:`3873`)
438438
- Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
439439
``reindex`` for location-based taking
440-
- Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem`` (:issue:`4246)
440+
- Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`)
441+
- Fixed non-unique indexing memory allocation issue with ``.ix/.loc`` (:issue:`4280`)
441442

442443
- ``DataFrame.from_records`` did not accept empty recarrays (:issue:`3682`)
443444
- ``read_html`` now correctly skips tests (:issue:`3741`)

pandas/index.pyx

+16 -2
Original file line number | Diff line number | Diff line change
@@ -278,14 +278,19 @@ cdef class IndexEngine:
278278
dict d = {}
279279
object val
280280
int count = 0, count_missing = 0
281-
Py_ssize_t i, j, n, n_t
281+
Py_ssize_t i, j, n, n_t, n_alloc
282282

283283
self._ensure_mapping_populated()
284284
values = self._get_index_values()
285285
stargets = set(targets)
286286
n = len(values)
287287
n_t = len(targets)
288-
result = np.empty(n*n_t, dtype=np.int64)
288+
if n > 10000:
289+
n_alloc = 10000
290+
else:
291+
n_alloc = n
292+
293+
result = np.empty(n_alloc, dtype=np.int64)
289294
missing = np.empty(n_t, dtype=np.int64)
290295

291296
# form the set of the results (like ismember)
@@ -304,12 +309,21 @@ cdef class IndexEngine:
304309
# found
305310
if val in d:
306311
for j in d[val]:
312+
313+
# realloc if needed
314+
if count >= n_alloc:
315+
n_alloc += 10000
316+
result = np.resize(result, n_alloc)
317+
307318
result[count] = j
308319
count += 1
309320

310321
# value not found
311322
else:
312323

324+
if count >= n_alloc:
325+
n_alloc += 10000
326+
result = np.resize(result, n_alloc)
313327
result[count] = -1
314328
count += 1
315329
missing[count_missing] = i

pandas/tests/test_indexing.py

+34
Original file line number | Diff line number | Diff line change
@@ -1102,6 +1102,40 @@ def test_mi_access(self):
11021102
result = df2['A']['B2']
11031103
assert_frame_equal(result,expected)
11041104

1105+
def test_non_unique_loc_memory_error(self):
1106+
1107+
# GH 4280
1108+
# non_unique index with a large selection triggers a memory error
1109+
1110+
columns = list('ABCDEFG')
1111+
def gen_test(l,l2):
1112+
return pd.concat([ DataFrame(randn(l,len(columns)),index=range(l),columns=columns),
1113+
DataFrame(np.ones((l2,len(columns))),index=[0]*l2,columns=columns) ])
1114+
1115+
1116+
def gen_expected(df,mask):
1117+
l = len(mask)
1118+
return pd.concat([
1119+
df.take([0],convert=False),
1120+
DataFrame(np.ones((l,len(columns))),index=[0]*l,columns=columns),
1121+
df.take(mask[1:],convert=False) ])
1122+
1123+
df = gen_test(900,100)
1124+
self.assert_(not df.index.is_unique)
1125+
1126+
mask = np.arange(100)
1127+
result = df.loc[mask]
1128+
expected = gen_expected(df,mask)
1129+
assert_frame_equal(result,expected)
1130+
1131+
df = gen_test(900000,100000)
1132+
self.assert_(not df.index.is_unique)
1133+
1134+
mask = np.arange(100000)
1135+
result = df.loc[mask]
1136+
expected = gen_expected(df,mask)
1137+
assert_frame_equal(result,expected)
1138+
11051139
if __name__ == '__main__':
11061140
import nose
11071141
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)