Merge pull request #11596 from jreback/memory

jreback · jreback · commit ddd03722fa8e · 2015-11-13T16:34:59.000-05:00
PERF/DOC: Option to .info() and .memory_usage() to provide for deep introspection of memory consumption #11595
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -284,6 +284,7 @@ Attributes
    Series.itemsize
    Series.base
    Series.T
+   Series.memory_usage
 
 Conversion
 ~~~~~~~~~~
@@ -772,6 +773,7 @@ Attributes and underlying data
    DataFrame.ndim
    DataFrame.size
    DataFrame.shape
+   DataFrame.memory_usage
 
 Conversion
 ~~~~~~~~~~
@@ -1333,6 +1335,7 @@ Attributes
    Index.itemsize
    Index.base
    Index.T
+   Index.memory_usage
 
 Modifying and Computations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/faq.rst b/doc/source/faq.rst
@@ -50,6 +50,16 @@ The ``+`` symbol indicates that the true memory usage could be higher, because
 pandas does not count the memory used by values in columns with
 ``dtype=object``.
 
+.. versionadded:: 0.17.1
+
+Passing ``memory_usage='deep'`` will enable a more accurate memory usage report
+that accounts for the full usage of the contained objects. This is optional,
+as the deeper introspection can be expensive.
+
+.. ipython:: python
+
+   df.info(memory_usage='deep')
+
 By default the display option is set to ``True`` but can be explicitly
 overridden by passing the ``memory_usage`` argument when invoking ``df.info()``.
 
diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt
@@ -27,6 +27,19 @@ Enhancements
 - Improve the error message displayed in :func:`pandas.io.gbq.to_gbq` when the DataFrame does not match the schema of the destination table (:issue:`11359`)
 - Added ``axvlines_kwds`` to parallel coordinates plot (:issue:`10709`)
 
+- Option to ``.info()`` and ``.memory_usage()`` to provide for deep introspection of memory consumption. Note that this can be expensive to compute and therefore is an optional parameter. (:issue:`11595`)
+
+.. ipython:: python
+
+   df = DataFrame({'A' : ['foo']*1000})
+   df['B'] = df['A'].astype('category')
+
+   # shows the '+' as we have object dtypes
+   df.info()
+
+   # we have an accurate memory assessment (but this can be expensive to compute)
+   df.info(memory_usage='deep')
+
 - ``Index`` now has ``fillna`` method (:issue:`10089`)
 
 .. ipython:: python
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -489,6 +489,36 @@ def nunique(self, dropna=True):
             n -= 1
         return n
 
+    def memory_usage(self, deep=False):
+        """
+        Memory usage of my values, in bytes
+
+        Parameters
+        ----------
+        deep : bool, default False
+            Introspect the data deeply, interrogate
+            `object` dtypes for system-level memory consumption
+
+        Returns
+        -------
+        bytes used
+
+        Notes
+        -----
+        Memory usage does not include memory consumed by elements that
+        are not components of the array if deep=False
+
+        See Also
+        --------
+        numpy.ndarray.nbytes
+        """
+        if hasattr(self.values,'memory_usage'):
+            return self.values.memory_usage(deep=deep)
+        # no memory_usage on the values: start from the raw nbytes
+        v = self.values.nbytes
+        if deep and com.is_object_dtype(self):  # add the objects themselves
+            v += lib.memory_usage_of_objects(self.values)
+        return v
 
     def factorize(self, sort=False, na_sentinel=-1):
         """
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -924,6 +924,31 @@ def T(self):
     def nbytes(self):
         return self._codes.nbytes + self._categories.values.nbytes
 
+    def memory_usage(self, deep=False):
+        """
+        Memory usage of my codes and categories, in bytes
+
+        Parameters
+        ----------
+        deep : bool, default False
+            Introspect the data deeply, interrogate
+            `object` dtypes for system-level memory consumption
+
+        Returns
+        -------
+        bytes used
+
+        Notes
+        -----
+        Memory usage does not include memory consumed by elements that
+        are not components of the array if deep=False
+
+        See Also
+        --------
+        numpy.ndarray.nbytes
+        """
+        return self._codes.nbytes + self._categories.memory_usage(deep=deep)
+
     def searchsorted(self, v, side='left', sorter=None):
         """Find indices where elements should be inserted to maintain order.
 
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -215,9 +215,9 @@
 """
 
 pc_memory_usage_doc = """
-: bool or None
+: bool, string or None
     This specifies if the memory usage of a DataFrame should be displayed when
-    df.info() is called.
+    df.info() is called. Valid values True,False,'deep'
 """
 
 style_backup = dict()
@@ -292,7 +292,7 @@ def mpl_style_cb(key):
     cf.register_option('line_width', get_default_val('display.width'),
                        pc_line_width_doc)
     cf.register_option('memory_usage', True, pc_memory_usage_doc,
-                        validator=is_instance_factory([type(None), bool]))
+                        validator=is_one_of_factory([None, True, False, 'deep']))
     cf.register_option('unicode.east_asian_width', False,
                        pc_east_asian_width_doc, validator=is_bool)
     cf.register_option('unicode.ambiguous_as_wide', False,
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1582,11 +1582,12 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_co
         max_cols : int, default None
             Determines whether full summary or short summary is printed.
             None follows the `display.max_info_columns` setting.
-        memory_usage : boolean, default None
+        memory_usage : boolean/string, default None
             Specifies whether total memory usage of the DataFrame
             elements (including index) should be displayed. None follows
             the `display.memory_usage` setting. True or False overrides
-            the `display.memory_usage` setting. Memory usage is shown in
+            the `display.memory_usage` setting. A value of 'deep' is equivalent
+            to True, with deep introspection. Memory usage is shown in
             human-readable units (base-2 representation).
         null_counts : boolean, default None
             Whether to show the non-null counts
@@ -1676,20 +1677,27 @@ def _sizeof_fmt(num, size_qualifier):
         counts = self.get_dtype_counts()
         dtypes = ['%s(%d)' % k for k in sorted(compat.iteritems(counts))]
         lines.append('dtypes: %s' % ', '.join(dtypes))
+
         if memory_usage is None:
             memory_usage = get_option('display.memory_usage')
-        if memory_usage:  # append memory usage of df to display
-            # size_qualifier is just a best effort; not guaranteed to catch all
-            # cases (e.g., it misses categorical data even with object
-            # categories)
-            size_qualifier = ('+' if 'object' in counts
-                              or is_object_dtype(self.index) else '')
-            mem_usage = self.memory_usage(index=True).sum()
+        if memory_usage:
+            # append memory usage of df to display
+            size_qualifier = ''
+            if memory_usage == 'deep':
+                deep=True
+            else:
+                # size_qualifier is just a best effort; not guaranteed to catch all
+                # cases (e.g., it misses categorical data even with object
+                # categories)
+                deep=False
+                if 'object' in counts or is_object_dtype(self.index):
+                    size_qualifier = '+'
+            mem_usage = self.memory_usage(index=True, deep=deep).sum()
             lines.append("memory usage: %s\n" %
                             _sizeof_fmt(mem_usage, size_qualifier))
         _put_lines(buf, lines)
 
-    def memory_usage(self, index=False):
+    def memory_usage(self, index=False, deep=False):
         """Memory usage of DataFrame columns.
 
         Parameters
@@ -1698,6 +1706,9 @@ def memory_usage(self, index=False):
             Specifies whether to include memory usage of DataFrame's
             index in returned Series. If `index=True` (default is False)
             the first index of the Series is `Index`.
+        deep : bool
+            Introspect the data deeply, interrogate
+            `object` dtypes for system-level memory consumption
 
         Returns
         -------
@@ -1708,17 +1719,17 @@ def memory_usage(self, index=False):
         Notes
         -----
         Memory usage does not include memory consumed by elements that
-        are not components of the array.
+        are not components of the array if deep=False
 
         See Also
         --------
         numpy.ndarray.nbytes
         """
-        result = Series([ c.values.nbytes for col, c in self.iteritems() ],
+        result = Series([ c.memory_usage(index=False, deep=deep) for col, c in self.iteritems() ],
                         index=self.columns)
         if index:
-             result = Series(self.index.nbytes,
-                        index=['Index']).append(result)
+             result = Series(self.index.memory_usage(deep=deep),
+                             index=['Index']).append(result)
         return result
 
     def transpose(self):
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2281,6 +2281,35 @@ def reindex_axis(self, labels, axis=0, **kwargs):
             raise ValueError("cannot reindex series on non-zero axis!")
         return self.reindex(index=labels, **kwargs)
 
+    def memory_usage(self, index=False, deep=False):
+        """Memory usage of the Series
+
+        Parameters
+        ----------
+        index : bool, default False
+            Specifies whether to include memory usage of Series index
+        deep : bool, default False
+            Introspect the data deeply, interrogate
+            `object` dtypes for system-level memory consumption
+
+        Returns
+        -------
+        scalar bytes of memory consumed
+
+        Notes
+        -----
+        Memory usage does not include memory consumed by elements that
+        are not components of the array if deep=False
+
+        See Also
+        --------
+        numpy.ndarray.nbytes
+        """
+        v = super(Series, self).memory_usage(deep=deep)  # parent class result
+        if index:  # add the index usage, using the same deep setting
+            v += self.index.memory_usage(deep=deep)
+        return v
+
     def take(self, indices, axis=0, convert=True, is_copy=False):
         """
         return Series corresponding to requested indices
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -182,6 +182,19 @@ def ismember_int64(ndarray[int64_t] arr, set values):
 
     return result.view(np.bool_)
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def memory_usage_of_objects(ndarray[object, ndim=1] arr):
+    """ return the memory usage of the elements of a 1-d object array in bytes
+    (sum of their __sizeof__()), does not include the bytes of the pointers """
+    cdef Py_ssize_t i, n
+    cdef int64_t s = 0
+
+    n = len(arr)
+    for i from 0 <= i < n:
+        s += arr[i].__sizeof__()  # size as reported by the element itself
+    return s
+
 #----------------------------------------------------------------------
 # datetime / io related
 
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
@@ -877,6 +877,28 @@ def get_fill_value(obj):
                 self.assertFalse(o is result)
 
 
+    def test_memory_usage(self):
+        for o in self.objs:
+            res = o.memory_usage()
+            res2 = o.memory_usage(deep=True)
+            # deep=True must only change the result for object dtype
+            if com.is_object_dtype(o):
+                self.assertTrue(res2 > res)
+            else:
+                self.assertEqual(res, res2)
+            # Series can additionally include their index
+            if isinstance(o, Series):
+                res = o.memory_usage(index=True)
+                res2 = o.memory_usage(index=True, deep=True)
+                if com.is_object_dtype(o) or com.is_object_dtype(o.index):
+                    self.assertTrue(res2 > res)
+                else:
+                    self.assertEqual(res, res2)
+                # index=True is the values usage plus the index usage
+                self.assertEqual(o.memory_usage(index=False) + o.index.memory_usage(),
+                                 o.memory_usage(index=True))
+
+
 class TestFloat64HashTable(tm.TestCase):
     def test_lookup_nan(self):
         from pandas.hashtable import Float64HashTable
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -1197,6 +1197,15 @@ def test_nbytes(self):
         exp = cat._codes.nbytes + cat._categories.values.nbytes
         self.assertEqual(cat.nbytes, exp)
 
+    def test_memory_usage(self):
+        cat = pd.Categorical([1,2,3])  # numeric categories: deep changes nothing
+        self.assertEqual(cat.nbytes, cat.memory_usage())
+        self.assertEqual(cat.nbytes, cat.memory_usage(deep=True))
+        # string categories: deep=True adds the size of the strings
+        cat = pd.Categorical(['foo','foo','bar'])
+        self.assertEqual(cat.nbytes, cat.memory_usage())
+        self.assertTrue(cat.memory_usage(deep=True) > cat.nbytes)
+
     def test_searchsorted(self):
         # https://github.com/pydata/pandas/issues/8420
         s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ])
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -7614,6 +7614,17 @@ def test_info_memory_usage(self):
         res = buf.getvalue().splitlines()
         self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))
 
+        df_with_object_index.info(buf=buf, memory_usage='deep')
+        res = buf.getvalue().splitlines()
+        self.assertTrue(re.match(r"memory usage: [^+]+$", res[-1]))
+
+        self.assertTrue(df_with_object_index.memory_usage(index=True, deep=True).sum() \
+                        > df_with_object_index.memory_usage(index=True).sum())
+
+        df_object = pd.DataFrame({'a': ['a']})
+        self.assertTrue(df_object.memory_usage(deep=True).sum() \
+                        > df_object.memory_usage().sum())
+
         # Test a DataFrame with duplicate columns
         dtypes = ['int64', 'int64', 'int64', 'float64']
         data = {}
@@ -7630,6 +7641,9 @@ def test_info_memory_usage(self):
         size_df = np.size(df.columns.values)  # index=False; default
         self.assertEqual(size_df, np.size(df.memory_usage()))
 
+        # assert deep works only on object
+        self.assertEqual(df.memory_usage().sum(),df.memory_usage(deep=True).sum())
+
         # test for validity
         DataFrame(1,index=['a'],columns=['A']).memory_usage(index=True)
         DataFrame(1,index=['a'],columns=['A']).index.nbytes