Skip to content

Commit 3853fe6

Browse files
committed
PERF: correctly report memory used by Index's
Author: Jeff Reback <[email protected]> Closes #15237 from jreback/memory and squashes the following commits: d77c002 [Jeff Reback] PERF: correctly report memory used by Index's
1 parent c67486f commit 3853fe6

File tree

8 files changed

+106
-4
lines changed

8 files changed

+106
-4
lines changed

doc/source/whatsnew/v0.20.0.txt

+37
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,43 @@ New Behavior:
315315
In [5]: df['a']['2011-12-31 23:59:59']
316316
Out[5]: 1
317317

318+
.. _whatsnew_0200.api_breaking.memory_usage:
319+
320+
Memory Usage for Index is more Accurate
321+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
322+
323+
In previous versions, calling ``.memory_usage()`` on a pandas structure that has an index would only include the actual index values and would not include the structures that facilitate fast indexing. The reported value will now generally be different for ``Index`` and ``MultiIndex``, and less so for other index types. (:issue:`15237`)
324+
325+
Previous Behavior:
326+
327+
.. code-block:: ipython
328+
329+
In [8]: index = Index(['foo', 'bar', 'baz'])
330+
331+
In [9]: index.memory_usage(deep=True)
332+
Out[9]: 180
333+
334+
In [10]: index.get_loc('foo')
335+
Out[10]: 0
336+
337+
In [11]: index.memory_usage(deep=True)
338+
Out[11]: 180
339+
340+
New Behavior:
341+
342+
.. code-block:: ipython
343+
344+
In [8]: index = Index(['foo', 'bar', 'baz'])
345+
346+
In [9]: index.memory_usage(deep=True)
347+
Out[9]: 180
348+
349+
In [10]: index.get_loc('foo')
350+
Out[10]: 0
351+
352+
In [11]: index.memory_usage(deep=True)
353+
Out[11]: 260
354+
318355
.. _whatsnew_0200.api:
319356

320357
Other API Changes

pandas/core/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -1067,6 +1067,7 @@ def memory_usage(self, deep=False):
10671067
v = self.values.nbytes
10681068
if deep and is_object_dtype(self):
10691069
v += lib.memory_usage_of_objects(self.values)
1070+
10701071
return v
10711072

10721073
def factorize(self, sort=False, na_sentinel=-1):

pandas/index.pyx

+9
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,15 @@ cdef class IndexEngine:
203203

204204
return result
205205

206+
def sizeof(self, deep=False):
207+
""" return the sizeof our mapping """
208+
if not self.is_mapping_populated:
209+
return 0
210+
return self.mapping.sizeof(deep=deep)
211+
212+
def __sizeof__(self):
213+
return self.sizeof()
214+
206215
property is_unique:
207216

208217
def __get__(self):

pandas/indexes/base.py

+8
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,14 @@ def get_values(self):
536536
""" return the underlying data as an ndarray """
537537
return self.values
538538

539+
@Appender(IndexOpsMixin.memory_usage.__doc__)
540+
def memory_usage(self, deep=False):
541+
result = super(Index, self).memory_usage(deep=deep)
542+
543+
# include our engine hashtable
544+
result += self._engine.sizeof(deep=deep)
545+
return result
546+
539547
# ops compat
540548
def tolist(self):
541549
"""

pandas/indexes/multi.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -446,13 +446,19 @@ def _nbytes(self, deep=False):
446446
return the number of bytes in the underlying data
447447
deeply introspect the level data if deep=True
448448
449+
include the engine hashtable
450+
449451
*this is in internal routine*
450452
451453
"""
452454
level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels))
453455
label_nbytes = sum((i.nbytes for i in self.labels))
454456
names_nbytes = sum((getsizeof(i) for i in self.names))
455-
return level_nbytes + label_nbytes + names_nbytes
457+
result = level_nbytes + label_nbytes + names_nbytes
458+
459+
# include our engine hashtable
460+
result += self._engine.sizeof(deep=deep)
461+
return result
456462

457463
def _format_attrs(self):
458464
"""

pandas/src/hashtable_class_helper.pxi.in

+19
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ cdef class ObjectVector:
203203

204204

205205
cdef class HashTable:
206+
206207
pass
207208

208209
{{py:
@@ -237,6 +238,12 @@ cdef class {{name}}HashTable(HashTable):
237238
k = kh_get_{{dtype}}(self.table, key)
238239
return k != self.table.n_buckets
239240

241+
def sizeof(self, deep=False):
242+
""" return the size of my table in bytes """
243+
return self.table.n_buckets * (sizeof({{dtype}}_t) + # keys
244+
sizeof(size_t) + # vals
245+
sizeof(uint32_t)) # flags
246+
240247
cpdef get_item(self, {{dtype}}_t val):
241248
cdef khiter_t k
242249
k = kh_get_{{dtype}}(self.table, val)
@@ -464,6 +471,12 @@ cdef class StringHashTable(HashTable):
464471
kh_destroy_str(self.table)
465472
self.table = NULL
466473

474+
def sizeof(self, deep=False):
475+
""" return the size of my table in bytes """
476+
return self.table.n_buckets * (sizeof(char *) + # keys
477+
sizeof(size_t) + # vals
478+
sizeof(uint32_t)) # flags
479+
467480
cpdef get_item(self, object val):
468481
cdef:
469482
khiter_t k
@@ -714,6 +727,12 @@ cdef class PyObjectHashTable(HashTable):
714727
k = kh_get_pymap(self.table, <PyObject*>key)
715728
return k != self.table.n_buckets
716729

730+
def sizeof(self, deep=False):
731+
""" return the size of my table in bytes """
732+
return self.table.n_buckets * (sizeof(PyObject *) + # keys
733+
sizeof(size_t) + # vals
734+
sizeof(uint32_t)) # flags
735+
717736
cpdef get_item(self, object val):
718737
cdef khiter_t k
719738
if val != val or val is None:

pandas/tests/indexes/common.py

+20
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,26 @@ def test_compat(self):
366366
for ind in self.indices.values():
367367
self.assertEqual(ind.tolist(), list(ind))
368368

369+
def test_memory_usage(self):
370+
for name, index in compat.iteritems(self.indices):
371+
result = index.memory_usage()
372+
if len(index):
373+
index.get_loc(index[0])
374+
result2 = index.memory_usage()
375+
result3 = index.memory_usage(deep=True)
376+
377+
# RangeIndex doesn't use a hashtable engine
378+
if not isinstance(index, RangeIndex):
379+
self.assertTrue(result2 > result)
380+
381+
if index.inferred_type == 'object':
382+
self.assertTrue(result3 > result2)
383+
384+
else:
385+
386+
# we report 0 for no-length
387+
self.assertEqual(result, 0)
388+
369389
def test_argsort(self):
370390
for k, ind in self.indices.items():
371391

pandas/tests/test_categorical.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -1555,11 +1555,13 @@ def test_nbytes(self):
15551555

15561556
def test_memory_usage(self):
15571557
cat = pd.Categorical([1, 2, 3])
1558-
self.assertEqual(cat.nbytes, cat.memory_usage())
1559-
self.assertEqual(cat.nbytes, cat.memory_usage(deep=True))
1558+
1559+
# .categories is an index, so we include the hashtable
1560+
self.assertTrue(cat.nbytes > 0 and cat.nbytes <= cat.memory_usage())
1561+
self.assertTrue(cat.nbytes > 0 and
1562+
cat.nbytes <= cat.memory_usage(deep=True))
15601563

15611564
cat = pd.Categorical(['foo', 'foo', 'bar'])
1562-
self.assertEqual(cat.nbytes, cat.memory_usage())
15631565
self.assertTrue(cat.memory_usage(deep=True) > cat.nbytes)
15641566

15651567
# sys.getsizeof will call the .memory_usage with

0 commit comments

Comments
 (0)