Skip to content

Commit 5033a4a

Browse files
authored
PERF: unnecessary materialization of a MultiIndex.values when introspecting memory (#14308)
closes #14308
1 parent c084bc1 commit 5033a4a

File tree

3 files changed

+43
-2
lines changed

3 files changed

+43
-2
lines changed

doc/source/whatsnew/v0.19.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1409,7 +1409,7 @@ Performance Improvements
14091409
- Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`)
14101410
- Improved performance by lazily creating indexing hashtables on larger Indexes (:issue:`14266`)
14111411
- Improved performance of ``groupby.groups`` (:issue:`14293`)
1412-
1412+
- Avoided unnecessary materialization of a MultiIndex when introspecting for memory usage (:issue:`14308`)
14131413

14141414
.. _whatsnew_0190.bug_fixes:
14151415

pandas/indexes/multi.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -413,10 +413,27 @@ def _shallow_copy(self, values=None, **kwargs):
413413
def dtype(self):
414414
return np.dtype('O')
415415

416+
@Appender(Index.memory_usage.__doc__)
417+
def memory_usage(self, deep=False):
418+
# we are overwriting our base class to avoid
419+
# computing .values here which could materialize
420+
# a tuple representation unnecessarily
421+
return self._nbytes(deep)
422+
416423
@cache_readonly
417424
def nbytes(self):
418425
""" return the number of bytes in the underlying data """
419-
level_nbytes = sum((i.nbytes for i in self.levels))
426+
return self._nbytes(False)
427+
428+
def _nbytes(self, deep=False):
429+
"""
430+
return the number of bytes in the underlying data
431+
deeply introspect the level data if deep=True
432+
433+
*this is an internal routine*
434+
435+
"""
436+
level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels))
420437
label_nbytes = sum((i.nbytes for i in self.labels))
421438
names_nbytes = sum((getsizeof(i) for i in self.names))
422439
return level_nbytes + label_nbytes + names_nbytes

pandas/tests/frame/test_repr_info.py

+24
Original file line numberDiff line numberDiff line change
@@ -381,3 +381,27 @@ def test_info_memory_usage(self):
381381
# deep=True, and add on some GC overhead
382382
diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
383383
self.assertTrue(abs(diff) < 100)
384+
385+
def test_info_memory_usage_bug_on_multiindex(self):
386+
# GH 14308
387+
# memory usage introspection should not materialize .values
388+
389+
from string import ascii_uppercase as uppercase
390+
391+
def memory_usage(f):
392+
return f.memory_usage(deep=True).sum()
393+
394+
N = 100
395+
M = len(uppercase)
396+
index = pd.MultiIndex.from_product([list(uppercase),
397+
pd.date_range('20160101',
398+
periods=N)],
399+
names=['id', 'date'])
400+
df = DataFrame({'value': np.random.randn(N * M)}, index=index)
401+
402+
unstacked = df.unstack('id')
403+
self.assertEqual(df.values.nbytes, unstacked.values.nbytes)
404+
self.assertTrue(memory_usage(df) > memory_usage(unstacked))
405+
406+
# high upper bound
407+
self.assertTrue(memory_usage(unstacked) - memory_usage(df) < 2000)

0 commit comments

Comments
 (0)