Skip to content

Commit 36bda21

Browse files
committed
PERF: correctly report memory used by Index's
1 parent 2619ee3 commit 36bda21

File tree

7 files changed

+95
-1
lines changed

7 files changed

+95
-1
lines changed

doc/source/whatsnew/v0.20.0.txt

+37
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,43 @@ New Behavior:
314314
In [5]: df['a']['2011-12-31 23:59:59']
315315
Out[5]: 1
316316

317+
.. _whatsnew_0200.api_breaking.memory_usage:
318+
319+
Memory Usage for Index now is now more accurate
320+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
321+
322+
In previous versions, showing ``.memory_usage()`` on a pandas structure that has an index, would only include actual index values and not include structures that facilitated fast indexing. This will generally be different for ``Index`` and ``MultiIndex`` and less-so for other index types.
323+
324+
Previous Behavior:
325+
326+
.. code-block:: ipython
327+
328+
In [8]: index = Index(['foo', 'bar', 'baz'])
329+
330+
In [9]: index.memory_usage(deep=True)
331+
Out[9]: 180
332+
333+
In [10]: index.get_loc('foo')
334+
Out[10]: 0
335+
336+
In [11]: index.memory_usage(deep=True)
337+
Out[11]: 180
338+
339+
New Behavior:
340+
341+
.. code-block:: ipython
342+
343+
In [8]: index = Index(['foo', 'bar', 'baz'])
344+
345+
In [9]: index.memory_usage(deep=True)
346+
Out[9]: 180
347+
348+
In [10]: index.get_loc('foo')
349+
Out[10]: 0
350+
351+
In [11]: index.memory_usage(deep=True)
352+
Out[11]: 276
353+
317354
.. _whatsnew_0200.api:
318355

319356
Other API Changes

pandas/core/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -1067,6 +1067,7 @@ def memory_usage(self, deep=False):
10671067
v = self.values.nbytes
10681068
if deep and is_object_dtype(self):
10691069
v += lib.memory_usage_of_objects(self.values)
1070+
10701071
return v
10711072

10721073
def factorize(self, sort=False, na_sentinel=-1):

pandas/index.pyx

+6
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,12 @@ cdef class IndexEngine:
203203

204204
return result
205205

206+
def sizeof(self, deep=False):
207+
""" return the sizeof our mapping """
208+
if not self.is_mapping_populated:
209+
return 0
210+
return self.mapping.sizeof(deep=deep)
211+
206212
property is_unique:
207213

208214
def __get__(self):

pandas/indexes/base.py

+8
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,14 @@ def get_values(self):
536536
""" return the underlying data as an ndarray """
537537
return self.values
538538

539+
@Appender(IndexOpsMixin.memory_usage.__doc__)
540+
def memory_usage(self, deep=False):
541+
result = super(Index, self).memory_usage(deep=deep)
542+
543+
# include our engine hashtable
544+
result += self._engine.sizeof(deep=deep)
545+
return result
546+
539547
# ops compat
540548
def tolist(self):
541549
"""

pandas/indexes/multi.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -446,13 +446,19 @@ def _nbytes(self, deep=False):
446446
return the number of bytes in the underlying data
447447
deeply introspect the level data if deep=True
448448
449+
include the engine hashtable
450+
449451
*this is in internal routine*
450452
451453
"""
452454
level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels))
453455
label_nbytes = sum((i.nbytes for i in self.labels))
454456
names_nbytes = sum((getsizeof(i) for i in self.names))
455-
return level_nbytes + label_nbytes + names_nbytes
457+
result = level_nbytes + label_nbytes + names_nbytes
458+
459+
# include our engine hashtable
460+
result += self._engine.sizeof(deep=deep)
461+
return result
456462

457463
def _format_attrs(self):
458464
"""

pandas/src/hashtable_class_helper.pxi.in

+19
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ cdef class ObjectVector:
203203

204204

205205
cdef class HashTable:
206+
206207
pass
207208

208209
{{py:
@@ -237,6 +238,12 @@ cdef class {{name}}HashTable(HashTable):
237238
k = kh_get_{{dtype}}(self.table, key)
238239
return k != self.table.n_buckets
239240

241+
def sizeof(self, deep=False):
242+
""" return the size of my table in bytes """
243+
return self.table.n_buckets * (sizeof(self.table.keys) +
244+
sizeof(self.table.vals) +
245+
sizeof(self.table.flags))
246+
240247
cpdef get_item(self, {{dtype}}_t val):
241248
cdef khiter_t k
242249
k = kh_get_{{dtype}}(self.table, val)
@@ -464,6 +471,12 @@ cdef class StringHashTable(HashTable):
464471
kh_destroy_str(self.table)
465472
self.table = NULL
466473

474+
def sizeof(self, deep=False):
475+
""" return the size of my table in bytes """
476+
return self.table.n_buckets * (sizeof(self.table.keys) +
477+
sizeof(self.table.vals) +
478+
sizeof(self.table.flags))
479+
467480
cpdef get_item(self, object val):
468481
cdef:
469482
khiter_t k
@@ -714,6 +727,12 @@ cdef class PyObjectHashTable(HashTable):
714727
k = kh_get_pymap(self.table, <PyObject*>key)
715728
return k != self.table.n_buckets
716729

730+
def sizeof(self, deep=False):
731+
""" return the size of my table in bytes """
732+
return self.table.n_buckets * (sizeof(self.table.keys) +
733+
sizeof(self.table.vals) +
734+
sizeof(self.table.flags))
735+
717736
cpdef get_item(self, object val):
718737
cdef khiter_t k
719738
if val != val or val is None:

pandas/tests/indexes/common.py

+17
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,23 @@ def test_compat(self):
366366
for ind in self.indices.values():
367367
self.assertEqual(ind.tolist(), list(ind))
368368

369+
def test_memory_usage(self):
370+
for name, index in compat.iteritems(self.indices):
371+
result = index.memory_usage()
372+
if len(index):
373+
index.get_loc(index[0])
374+
result2 = index.memory_usage()
375+
result3 = index.memory_usage(deep=True)
376+
self.assertTrue(result2 > result)
377+
378+
if index.inferred_type == 'object':
379+
self.assertTrue(result3 > result2)
380+
381+
else:
382+
383+
# we report 0 for no-length
384+
self.assertEqual(result, 0)
385+
369386
def test_argsort(self):
370387
for k, ind in self.indices.items():
371388

0 commit comments

Comments
 (0)