Skip to content

Commit 60cacab

Browse files
max-sixty and jreback
authored and committed
index is included in memory usage by default, sys.getsizeof returns correct value, #11597
1 parent bc0a166 commit 60cacab

File tree

8 files changed

+104
-57
lines changed

8 files changed

+104
-57
lines changed

doc/source/faq.rst

+4-4
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,13 @@ dataframe can be found with the memory_usage method:
7676
# total memory usage of dataframe
7777
df.memory_usage().sum()
7878
79-
By default the memory usage of the dataframe's index is not shown in the
80-
returned Series, the memory usage of the index can be shown by passing
81-
the ``index=True`` argument:
79+
By default the memory usage of the dataframe's index is shown in the
80+
returned Series, the memory usage of the index can be suppressed by passing
81+
the ``index=False`` argument:
8282

8383
.. ipython:: python
8484
85-
df.memory_usage(index=True)
85+
df.memory_usage(index=False)
8686
8787
The memory usage displayed by the ``info`` method utilizes the
8888
``memory_usage`` method to determine the memory usage of a dataframe

doc/source/whatsnew/v0.18.0.txt

+5-1
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,9 @@ Other enhancements
107107
- ``read_excel`` now supports s3 urls of the format ``s3://bucketname/filename`` (:issue:`11447`)
108108
- A simple version of ``Panel.round()`` is now implemented (:issue:`11763`)
109109
- For Python 3.x, ``round(DataFrame)``, ``round(Series)``, ``round(Panel)`` will work (:issue:`11763`)
110-
- ``Dataframe`` has gained a ``_repr_latex_`` method in order to allow for automatic conversion to latex in a ipython/jupyter notebook using nbconvert. Options ``display.latex.escape`` and ``display.latex.longtable`` have been added to the configuration and are used automatically by the ``to_latex`` method.(:issue:`11778`)
110+
- ``DataFrame`` has gained a ``_repr_latex_`` method in order to allow for automatic conversion to latex in a ipython/jupyter notebook using nbconvert. Options ``display.latex.escape`` and ``display.latex.longtable`` have been added to the configuration and are used automatically by the ``to_latex`` method.(:issue:`11778`)
111+
- ``sys.getsizeof(obj)`` returns the memory usage of a pandas object, including the
112+
values it contains (:issue:`11597`)
111113

112114
.. _whatsnew_0180.enhancements.rounding:
113115

@@ -283,6 +285,8 @@ of date strings is no longer supported and raises a ``ValueError``. (:issue:`118
283285
In [2]: s.between_time('20150101 07:00:00','20150101 09:00:00')
284286
ValueError: Cannot convert arg ['20150101 07:00:00'] to a time.
285287

288+
- ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`)
289+
286290

287291

288292

pandas/core/base.py

+16
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,22 @@ def _reset_cache(self, key=None):
112112
else:
113113
self._cache.pop(key, None)
114114

115+
def __sizeof__(self):
116+
"""
117+
Generates the total memory usage for a object that returns
118+
either a value or Series of values
119+
"""
120+
if hasattr(self, 'memory_usage'):
121+
mem = self.memory_usage(deep=True)
122+
if not lib.isscalar(mem):
123+
mem = mem.sum()
124+
return int(mem)
125+
126+
# no memory_usage attribute, so fall back to
127+
# object's 'sizeof'
128+
return super(self, PandasObject).__sizeof__()
129+
130+
115131
class NoNewAttributesMixin(object):
116132
"""Mixin which prevents adding new attributes.
117133

pandas/core/frame.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import numpy as np
2424
import numpy.ma as ma
2525

26-
from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _not_none,
26+
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
2727
_default_index, _maybe_upcast, is_sequence,
2828
_infer_dtype_from_scalar, _values_from_object,
2929
is_list_like, _maybe_box_datetimelike,
@@ -46,8 +46,7 @@
4646
from pandas.compat import(range, map, zip, lrange, lmap, lzip, StringIO, u,
4747
OrderedDict, raise_with_traceback)
4848
from pandas import compat
49-
from pandas.sparse.array import SparseArray
50-
from pandas.util.decorators import (cache_readonly, deprecate, Appender,
49+
from pandas.util.decorators import (deprecate, Appender,
5150
Substitution, deprecate_kwarg)
5251

5352
from pandas.tseries.period import PeriodIndex
@@ -1720,10 +1719,11 @@ def _sizeof_fmt(num, size_qualifier):
17201719
size_qualifier = '+'
17211720
mem_usage = self.memory_usage(index=True, deep=deep).sum()
17221721
lines.append("memory usage: %s\n" %
1723-
_sizeof_fmt(mem_usage, size_qualifier))
1722+
_sizeof_fmt(mem_usage, size_qualifier)
1723+
)
17241724
_put_lines(buf, lines)
17251725

1726-
def memory_usage(self, index=False, deep=False):
1726+
def memory_usage(self, index=True, deep=False):
17271727
"""Memory usage of DataFrame columns.
17281728
17291729
Parameters

pandas/core/series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2335,7 +2335,7 @@ def reindex_axis(self, labels, axis=0, **kwargs):
23352335
raise ValueError("cannot reindex series on non-zero axis!")
23362336
return self.reindex(index=labels, **kwargs)
23372337

2338-
def memory_usage(self, index=False, deep=False):
2338+
def memory_usage(self, index=True, deep=False):
23392339
"""Memory usage of the Series
23402340
23412341
Parameters

pandas/tests/test_base.py

+33-25
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
11
# -*- coding: utf-8 -*-
22
from __future__ import print_function
3+
34
import re
5+
import sys
46
from datetime import datetime, timedelta
7+
58
import numpy as np
6-
import pandas.compat as compat
9+
710
import pandas as pd
8-
from pandas.compat import u, StringIO
9-
from pandas.core.base import FrozenList, FrozenNDArray, PandasDelegate, NoNewAttributesMixin
11+
import pandas.compat as compat
1012
import pandas.core.common as com
13+
import pandas.util.testing as tm
14+
from pandas import (Series, Index, DatetimeIndex,
15+
TimedeltaIndex, PeriodIndex, Timedelta)
16+
from pandas.compat import u, StringIO
17+
from pandas.core.base import (FrozenList, FrozenNDArray,
18+
PandasDelegate, NoNewAttributesMixin)
1119
from pandas.tseries.base import DatetimeIndexOpsMixin
12-
from pandas.util.testing import assertRaisesRegexp, assertIsInstance
13-
from pandas.tseries.common import is_datetimelike
14-
from pandas import Series, Index, Int64Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta
15-
import pandas.tslib as tslib
16-
from pandas import _np_version_under1p9
17-
import nose
20+
from pandas.util.testing import (assertRaisesRegexp,
21+
assertIsInstance)
1822

19-
import pandas.util.testing as tm
2023

2124
class CheckStringMixin(object):
2225
def test_string_methods_dont_fail(self):
@@ -112,7 +115,9 @@ def setUp(self):
112115
def test_shallow_copying(self):
113116
original = self.container.copy()
114117
assertIsInstance(self.container.view(), FrozenNDArray)
115-
self.assertFalse(isinstance(self.container.view(np.ndarray), FrozenNDArray))
118+
self.assertFalse(isinstance(
119+
self.container.view(np.ndarray), FrozenNDArray
120+
))
116121
self.assertIsNot(self.container.view(), self.container)
117122
self.assert_numpy_array_equal(self.container, original)
118123
# shallow copy should be the same too
@@ -881,27 +886,30 @@ def get_fill_value(obj):
881886
# check shallow_copied
882887
self.assertFalse(o is result)
883888

884-
885889
def test_memory_usage(self):
886890
for o in self.objs:
887891
res = o.memory_usage()
888-
res2 = o.memory_usage(deep=True)
892+
res_deep = o.memory_usage(deep=True)
889893

890-
if com.is_object_dtype(o):
891-
self.assertTrue(res2 > res)
894+
if (com.is_object_dtype(o) or
895+
(isinstance(o, Series) and
896+
com.is_object_dtype(o.index))):
897+
# if there are objects, only deep will pick them up
898+
self.assertTrue(res_deep > res)
892899
else:
893-
self.assertEqual(res, res2)
900+
self.assertEqual(res, res_deep)
894901

895902
if isinstance(o, Series):
896-
res = o.memory_usage(index=True)
897-
res2 = o.memory_usage(index=True, deep=True)
898-
if com.is_object_dtype(o) or com.is_object_dtype(o.index):
899-
self.assertTrue(res2 > res)
900-
else:
901-
self.assertEqual(res, res2)
902-
903-
self.assertEqual(o.memory_usage(index=False) + o.index.memory_usage(),
904-
o.memory_usage(index=True))
903+
self.assertEqual(
904+
(o.memory_usage(index=False) +
905+
o.index.memory_usage()),
906+
o.memory_usage(index=True)
907+
)
908+
909+
# sys.getsizeof will call the .memory_usage with
910+
# deep=True, and add on some GC overhead
911+
diff = res_deep - sys.getsizeof(o)
912+
self.assertTrue(abs(diff) < 100)
905913

906914

907915
class TestFloat64HashTable(tm.TestCase):

pandas/tests/test_categorical.py

+18-11
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,22 @@
11
# -*- coding: utf-8 -*-
22
# pylint: disable=E1101,E1103,W0232
33

4-
from datetime import datetime
5-
from pandas.compat import range, lrange, u, PY3
64
import os
7-
import pickle
8-
import re
5+
import sys
6+
from datetime import datetime
97
from distutils.version import LooseVersion
108

119
import numpy as np
12-
import pandas as pd
1310

14-
from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp, CategoricalIndex
15-
16-
from pandas.core.config import option_context
17-
import pandas.core.common as com
11+
import pandas as pd
1812
import pandas.compat as compat
13+
import pandas.core.common as com
1914
import pandas.util.testing as tm
15+
from pandas import (Categorical, Index, Series, DataFrame,
16+
PeriodIndex, Timestamp, CategoricalIndex)
17+
from pandas.compat import range, lrange, u, PY3
18+
from pandas.core.config import option_context
19+
2020

2121
class TestCategorical(tm.TestCase):
2222
_multiprocess_can_split_ = True
@@ -1219,10 +1219,17 @@ def test_memory_usage(self):
12191219
self.assertEqual(cat.nbytes, cat.memory_usage())
12201220
self.assertTrue(cat.memory_usage(deep=True) > cat.nbytes)
12211221

1222+
# sys.getsizeof will call the .memory_usage with
1223+
# deep=True, and add on some GC overhead
1224+
diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
1225+
self.assertTrue(abs(diff) < 100)
1226+
12221227
def test_searchsorted(self):
12231228
# https://github.com/pydata/pandas/issues/8420
1224-
s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ])
1225-
s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ])
1229+
s1 = pd.Series(['apple', 'bread', 'bread', 'cheese',
1230+
'milk'])
1231+
s2 = pd.Series(['apple', 'bread', 'bread', 'cheese',
1232+
'milk', 'donuts'])
12261233
c1 = pd.Categorical(s1, ordered=True)
12271234
c2 = pd.Categorical(s2, ordered=True)
12281235

pandas/tests/test_frame.py

+22-10
Original file line numberDiff line numberDiff line change
@@ -7699,23 +7699,35 @@ def test_info_memory_usage(self):
76997699
df.columns = dtypes
77007700
# Ensure df size is as expected
77017701
df_size = df.memory_usage().sum()
7702-
exp_size = len(dtypes) * n * 8 # cols * rows * bytes
7702+
exp_size = (len(dtypes) + 1) * n * 8 # (cols + index) * rows * bytes
77037703
self.assertEqual(df_size, exp_size)
77047704
# Ensure number of cols in memory_usage is the same as df
7705-
size_df = np.size(df.columns.values) # index=False; default
7705+
size_df = np.size(df.columns.values) + 1 # index=True; default
77067706
self.assertEqual(size_df, np.size(df.memory_usage()))
77077707

77087708
# assert deep works only on object
7709-
self.assertEqual(df.memory_usage().sum(),df.memory_usage(deep=True).sum())
7709+
self.assertEqual(df.memory_usage().sum(),
7710+
df.memory_usage(deep=True).sum())
77107711

77117712
# test for validity
7712-
DataFrame(1,index=['a'],columns=['A']).memory_usage(index=True)
7713-
DataFrame(1,index=['a'],columns=['A']).index.nbytes
7714-
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes
7715-
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes
7716-
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).memory_usage(index=True)
7717-
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes
7718-
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes
7713+
DataFrame(1, index=['a'], columns=['A']
7714+
).memory_usage(index=True)
7715+
DataFrame(1, index=['a'], columns=['A']
7716+
).index.nbytes
7717+
df = DataFrame(
7718+
data=1,
7719+
index=pd.MultiIndex.from_product(
7720+
[['a'], range(1000)]),
7721+
columns=['A']
7722+
)
7723+
df.index.nbytes
7724+
df.memory_usage(index=True)
7725+
df.index.values.nbytes
7726+
7727+
# sys.getsizeof will call the .memory_usage with
7728+
# deep=True, and add on some GC overhead
7729+
diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
7730+
self.assertTrue(abs(diff) < 100)
77197731

77207732
def test_dtypes(self):
77217733
self.mixed_frame['bool'] = self.mixed_frame['A'] > 0

0 commit comments

Comments
 (0)