Skip to content

Commit ddd0372

Browse files
committed
Merge pull request #11596 from jreback/memory
PERF/DOC: Option to .info() and .memory_usage() to provide for deep introspection of memory consumption #11595
2 parents dbf8885 + 89cad6b commit ddd0372

File tree

12 files changed

+196
-17
lines changed

12 files changed

+196
-17
lines changed

doc/source/api.rst

+3
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,7 @@ Attributes
284284
Series.itemsize
285285
Series.base
286286
Series.T
287+
Series.memory_usage
287288

288289
Conversion
289290
~~~~~~~~~~
@@ -772,6 +773,7 @@ Attributes and underlying data
772773
DataFrame.ndim
773774
DataFrame.size
774775
DataFrame.shape
776+
DataFrame.memory_usage
775777

776778
Conversion
777779
~~~~~~~~~~
@@ -1333,6 +1335,7 @@ Attributes
13331335
Index.itemsize
13341336
Index.base
13351337
Index.T
1338+
Index.memory_usage
13361339

13371340
Modifying and Computations
13381341
~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/faq.rst

+10
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,16 @@ The ``+`` symbol indicates that the true memory usage could be higher, because
5050
pandas does not count the memory used by values in columns with
5151
``dtype=object``.
5252

53+
.. versionadded:: 0.17.1
54+
55+
Passing ``memory_usage='deep'`` will enable a more accurate memory usage report
56+
that accounts for the full usage of the contained objects. This is optional
57+
as it can be expensive to do this deeper introspection.
58+
59+
.. ipython:: python
60+
61+
df.info(memory_usage='deep')
62+
5363
By default the display option is set to ``True`` but can be explicitly
5464
overridden by passing the ``memory_usage`` argument when invoking ``df.info()``.
5565

doc/source/whatsnew/v0.17.1.txt

+13
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,19 @@ Enhancements
2727
- Improve the error message displayed in :func:`pandas.io.gbq.to_gbq` when the DataFrame does not match the schema of the destination table (:issue:`11359`)
2828
- Added ``axvlines_kwds`` to parallel coordinates plot (:issue:`10709`)
2929

30+
- Option to ``.info()`` and ``.memory_usage()`` to provide for deep introspection of memory consumption. Note that this can be expensive to compute and therefore is an optional parameter. (:issue:`11595`)
31+
32+
.. ipython:: python
33+
34+
df = DataFrame({'A' : ['foo']*1000})
35+
df['B'] = df['A'].astype('category')
36+
37+
# shows the '+' as we have object dtypes
38+
df.info()
39+
40+
# we have an accurate memory assessment (but can be expensive to compute this)
41+
df.info(memory_usage='deep')
42+
3043
- ``Index`` now has ``fillna`` method (:issue:`10089`)
3144

3245
.. ipython:: python

pandas/core/base.py

+30
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,36 @@ def nunique(self, dropna=True):
489489
n -= 1
490490
return n
491491

492+
def memory_usage(self, deep=False):
493+
"""
494+
Memory usage of my values
495+
496+
Parameters
497+
----------
498+
deep : bool
499+
Introspect the data deeply, interrogate
500+
`object` dtypes for system-level memory consumption
501+
502+
Returns
503+
-------
504+
bytes used
505+
506+
Notes
507+
-----
508+
Memory usage does not include memory consumed by elements that
509+
are not components of the array if deep=False
510+
511+
See Also
512+
--------
513+
numpy.ndarray.nbytes
514+
"""
515+
if hasattr(self.values,'memory_usage'):
516+
return self.values.memory_usage(deep=deep)
517+
518+
v = self.values.nbytes
519+
if deep and com.is_object_dtype(self):
520+
v += lib.memory_usage_of_objects(self.values)
521+
return v
492522

493523
def factorize(self, sort=False, na_sentinel=-1):
494524
"""

pandas/core/categorical.py

+25
Original file line numberDiff line numberDiff line change
@@ -924,6 +924,31 @@ def T(self):
924924
def nbytes(self):
925925
return self._codes.nbytes + self._categories.values.nbytes
926926

927+
def memory_usage(self, deep=False):
928+
"""
929+
Memory usage of my values
930+
931+
Parameters
932+
----------
933+
deep : bool
934+
Introspect the data deeply, interrogate
935+
`object` dtypes for system-level memory consumption
936+
937+
Returns
938+
-------
939+
bytes used
940+
941+
Notes
942+
-----
943+
Memory usage does not include memory consumed by elements that
944+
are not components of the array if deep=False
945+
946+
See Also
947+
--------
948+
numpy.ndarray.nbytes
949+
"""
950+
return self._codes.nbytes + self._categories.memory_usage(deep=deep)
951+
927952
def searchsorted(self, v, side='left', sorter=None):
928953
"""Find indices where elements should be inserted to maintain order.
929954

pandas/core/config_init.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -215,9 +215,9 @@
215215
"""
216216

217217
pc_memory_usage_doc = """
218-
: bool or None
218+
: bool, string or None
219219
This specifies if the memory usage of a DataFrame should be displayed when
220-
df.info() is called.
220+
df.info() is called. Valid values: True, False, 'deep'.
221221
"""
222222

223223
style_backup = dict()
@@ -292,7 +292,7 @@ def mpl_style_cb(key):
292292
cf.register_option('line_width', get_default_val('display.width'),
293293
pc_line_width_doc)
294294
cf.register_option('memory_usage', True, pc_memory_usage_doc,
295-
validator=is_instance_factory([type(None), bool]))
295+
validator=is_one_of_factory([None, True, False, 'deep']))
296296
cf.register_option('unicode.east_asian_width', False,
297297
pc_east_asian_width_doc, validator=is_bool)
298298
cf.register_option('unicode.ambiguous_as_wide', False,

pandas/core/frame.py

+25-14
Original file line numberDiff line numberDiff line change
@@ -1582,11 +1582,12 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_co
15821582
max_cols : int, default None
15831583
Determines whether full summary or short summary is printed.
15841584
None follows the `display.max_info_columns` setting.
1585-
memory_usage : boolean, default None
1585+
memory_usage : boolean/string, default None
15861586
Specifies whether total memory usage of the DataFrame
15871587
elements (including index) should be displayed. None follows
15881588
the `display.memory_usage` setting. True or False overrides
1589-
the `display.memory_usage` setting. Memory usage is shown in
1589+
the `display.memory_usage` setting. A value of 'deep' is equivalent
1590+
to True, with deep introspection. Memory usage is shown in
15901591
human-readable units (base-2 representation).
15911592
null_counts : boolean, default None
15921593
Whether to show the non-null counts
@@ -1676,20 +1677,27 @@ def _sizeof_fmt(num, size_qualifier):
16761677
counts = self.get_dtype_counts()
16771678
dtypes = ['%s(%d)' % k for k in sorted(compat.iteritems(counts))]
16781679
lines.append('dtypes: %s' % ', '.join(dtypes))
1680+
16791681
if memory_usage is None:
16801682
memory_usage = get_option('display.memory_usage')
1681-
if memory_usage: # append memory usage of df to display
1682-
# size_qualifier is just a best effort; not guaranteed to catch all
1683-
# cases (e.g., it misses categorical data even with object
1684-
# categories)
1685-
size_qualifier = ('+' if 'object' in counts
1686-
or is_object_dtype(self.index) else '')
1687-
mem_usage = self.memory_usage(index=True).sum()
1683+
if memory_usage:
1684+
# append memory usage of df to display
1685+
size_qualifier = ''
1686+
if memory_usage == 'deep':
1687+
deep=True
1688+
else:
1689+
# size_qualifier is just a best effort; not guaranteed to catch all
1690+
# cases (e.g., it misses categorical data even with object
1691+
# categories)
1692+
deep=False
1693+
if 'object' in counts or is_object_dtype(self.index):
1694+
size_qualifier = '+'
1695+
mem_usage = self.memory_usage(index=True, deep=deep).sum()
16881696
lines.append("memory usage: %s\n" %
16891697
_sizeof_fmt(mem_usage, size_qualifier))
16901698
_put_lines(buf, lines)
16911699

1692-
def memory_usage(self, index=False):
1700+
def memory_usage(self, index=False, deep=False):
16931701
"""Memory usage of DataFrame columns.
16941702
16951703
Parameters
@@ -1698,6 +1706,9 @@ def memory_usage(self, index=False):
16981706
Specifies whether to include memory usage of DataFrame's
16991707
index in returned Series. If `index=True` (default is False)
17001708
the first index of the Series is `Index`.
1709+
deep : bool
1710+
Introspect the data deeply, interrogate
1711+
`object` dtypes for system-level memory consumption
17011712
17021713
Returns
17031714
-------
@@ -1708,17 +1719,17 @@ def memory_usage(self, index=False):
17081719
Notes
17091720
-----
17101721
Memory usage does not include memory consumed by elements that
1711-
are not components of the array.
1722+
are not components of the array if deep=False
17121723
17131724
See Also
17141725
--------
17151726
numpy.ndarray.nbytes
17161727
"""
1717-
result = Series([ c.values.nbytes for col, c in self.iteritems() ],
1728+
result = Series([ c.memory_usage(index=False, deep=deep) for col, c in self.iteritems() ],
17181729
index=self.columns)
17191730
if index:
1720-
result = Series(self.index.nbytes,
1721-
index=['Index']).append(result)
1731+
result = Series(self.index.memory_usage(deep=deep),
1732+
index=['Index']).append(result)
17221733
return result
17231734

17241735
def transpose(self):

pandas/core/series.py

+29
Original file line numberDiff line numberDiff line change
@@ -2281,6 +2281,35 @@ def reindex_axis(self, labels, axis=0, **kwargs):
22812281
raise ValueError("cannot reindex series on non-zero axis!")
22822282
return self.reindex(index=labels, **kwargs)
22832283

2284+
def memory_usage(self, index=False, deep=False):
2285+
"""Memory usage of the Series
2286+
2287+
Parameters
2288+
----------
2289+
index : bool
2290+
Specifies whether to include memory usage of Series index
2291+
deep : bool
2292+
Introspect the data deeply, interrogate
2293+
`object` dtypes for system-level memory consumption
2294+
2295+
Returns
2296+
-------
2297+
scalar bytes of memory consumed
2298+
2299+
Notes
2300+
-----
2301+
Memory usage does not include memory consumed by elements that
2302+
are not components of the array if deep=False
2303+
2304+
See Also
2305+
--------
2306+
numpy.ndarray.nbytes
2307+
"""
2308+
v = super(Series, self).memory_usage(deep=deep)
2309+
if index:
2310+
v += self.index.memory_usage(deep=deep)
2311+
return v
2312+
22842313
def take(self, indices, axis=0, convert=True, is_copy=False):
22852314
"""
22862315
return Series corresponding to requested indices

pandas/lib.pyx

+13
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,19 @@ def ismember_int64(ndarray[int64_t] arr, set values):
182182

183183
return result.view(np.bool_)
184184

185+
@cython.wraparound(False)
186+
@cython.boundscheck(False)
187+
def memory_usage_of_objects(ndarray[object, ndim=1] arr):
188+
""" return the memory usage of an object array in bytes,
189+
does not include the actual bytes of the pointers """
190+
cdef Py_ssize_t i, n
191+
cdef int64_t s = 0
192+
193+
n = len(arr)
194+
for i from 0 <= i < n:
195+
s += arr[i].__sizeof__()
196+
return s
197+
185198
#----------------------------------------------------------------------
186199
# datetime / io related
187200

pandas/tests/test_base.py

+22
Original file line numberDiff line numberDiff line change
@@ -877,6 +877,28 @@ def get_fill_value(obj):
877877
self.assertFalse(o is result)
878878

879879

880+
def test_memory_usage(self):
881+
for o in self.objs:
882+
res = o.memory_usage()
883+
res2 = o.memory_usage(deep=True)
884+
885+
if com.is_object_dtype(o):
886+
self.assertTrue(res2 > res)
887+
else:
888+
self.assertEqual(res, res2)
889+
890+
if isinstance(o, Series):
891+
res = o.memory_usage(index=True)
892+
res2 = o.memory_usage(index=True, deep=True)
893+
if com.is_object_dtype(o) or com.is_object_dtype(o.index):
894+
self.assertTrue(res2 > res)
895+
else:
896+
self.assertEqual(res, res2)
897+
898+
self.assertEqual(o.memory_usage(index=False) + o.index.memory_usage(),
899+
o.memory_usage(index=True))
900+
901+
880902
class TestFloat64HashTable(tm.TestCase):
881903
def test_lookup_nan(self):
882904
from pandas.hashtable import Float64HashTable

pandas/tests/test_categorical.py

+9
Original file line numberDiff line numberDiff line change
@@ -1197,6 +1197,15 @@ def test_nbytes(self):
11971197
exp = cat._codes.nbytes + cat._categories.values.nbytes
11981198
self.assertEqual(cat.nbytes, exp)
11991199

1200+
def test_memory_usage(self):
1201+
cat = pd.Categorical([1,2,3])
1202+
self.assertEqual(cat.nbytes, cat.memory_usage())
1203+
self.assertEqual(cat.nbytes, cat.memory_usage(deep=True))
1204+
1205+
cat = pd.Categorical(['foo','foo','bar'])
1206+
self.assertEqual(cat.nbytes, cat.memory_usage())
1207+
self.assertTrue(cat.memory_usage(deep=True) > cat.nbytes)
1208+
12001209
def test_searchsorted(self):
12011210
# https://github.com/pydata/pandas/issues/8420
12021211
s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ])

pandas/tests/test_frame.py

+14
Original file line numberDiff line numberDiff line change
@@ -7614,6 +7614,17 @@ def test_info_memory_usage(self):
76147614
res = buf.getvalue().splitlines()
76157615
self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))
76167616

7617+
df_with_object_index.info(buf=buf, memory_usage='deep')
7618+
res = buf.getvalue().splitlines()
7619+
self.assertTrue(re.match(r"memory usage: [^+]+$", res[-1]))
7620+
7621+
self.assertTrue(df_with_object_index.memory_usage(index=True, deep=True).sum() \
7622+
> df_with_object_index.memory_usage(index=True).sum())
7623+
7624+
df_object = pd.DataFrame({'a': ['a']})
7625+
self.assertTrue(df_object.memory_usage(deep=True).sum() \
7626+
> df_object.memory_usage().sum())
7627+
76177628
# Test a DataFrame with duplicate columns
76187629
dtypes = ['int64', 'int64', 'int64', 'float64']
76197630
data = {}
@@ -7630,6 +7641,9 @@ def test_info_memory_usage(self):
76307641
size_df = np.size(df.columns.values) # index=False; default
76317642
self.assertEqual(size_df, np.size(df.memory_usage()))
76327643

7644+
# assert deep works only on object
7645+
self.assertEqual(df.memory_usage().sum(),df.memory_usage(deep=True).sum())
7646+
76337647
# test for validity
76347648
DataFrame(1,index=['a'],columns=['A']).memory_usage(index=True)
76357649
DataFrame(1,index=['a'],columns=['A']).index.nbytes

0 commit comments

Comments
 (0)