Skip to content

Commit 60cacab

Browse files
max-sixty and jreback
authored and committed
index is included in memory usage by default, sys.getsizeof returns correct value, #11597
1 parent bc0a166 commit 60cacab

File tree

8 files changed

+104
-57
lines changed

8 files changed

+104
-57
lines changed

doc/source/faq.rst

+4-4
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,13 @@ dataframe can be found with the memory_usage method:
7676
# total memory usage of dataframe
7777
df.memory_usage().sum()
7878
79-
By default the memory usage of the dataframe's index is not shown in the
80-
returned Series, the memory usage of the index can be shown by passing
81-
the ``index=True`` argument:
79+
By default the memory usage of the dataframe's index is shown in the
80+
returned Series, the memory usage of the index can be suppressed by passing
81+
the ``index=False`` argument:
8282

8383
.. ipython:: python
8484
85-
df.memory_usage(index=True)
85+
df.memory_usage(index=False)
8686
8787
The memory usage displayed by the ``info`` method utilizes the
8888
``memory_usage`` method to determine the memory usage of a dataframe

doc/source/whatsnew/v0.18.0.txt

+5-1
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,9 @@ Other enhancements
107107
- ``read_excel`` now supports s3 urls of the format ``s3://bucketname/filename`` (:issue:`11447`)
108108
- A simple version of ``Panel.round()`` is now implemented (:issue:`11763`)
109109
- For Python 3.x, ``round(DataFrame)``, ``round(Series)``, ``round(Panel)`` will work (:issue:`11763`)
110-
- ``Dataframe`` has gained a ``_repr_latex_`` method in order to allow for automatic conversion to latex in a ipython/jupyter notebook using nbconvert. Options ``display.latex.escape`` and ``display.latex.longtable`` have been added to the configuration and are used automatically by the ``to_latex`` method.(:issue:`11778`)
110+
- ``DataFrame`` has gained a ``_repr_latex_`` method in order to allow for automatic conversion to latex in a ipython/jupyter notebook using nbconvert. Options ``display.latex.escape`` and ``display.latex.longtable`` have been added to the configuration and are used automatically by the ``to_latex`` method.(:issue:`11778`)
111+
- ``sys.getsizeof(obj)`` returns the memory usage of a pandas object, including the
112+
values it contains (:issue:`11597`)
111113

112114
.. _whatsnew_0180.enhancements.rounding:
113115

@@ -283,6 +285,8 @@ of date strings is no longer supported and raises a ``ValueError``. (:issue:`118
283285
In [2]: s.between_time('20150101 07:00:00','20150101 09:00:00')
284286
ValueError: Cannot convert arg ['20150101 07:00:00'] to a time.
285287

288+
- ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`)
289+
286290

287291

288292

pandas/core/base.py

+16
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,22 @@ def _reset_cache(self, key=None):
112112
else:
113113
self._cache.pop(key, None)
114114

115+
def __sizeof__(self):
116+
"""
117+
Generates the total memory usage for a object that returns
118+
either a value or Series of values
119+
"""
120+
if hasattr(self, 'memory_usage'):
121+
mem = self.memory_usage(deep=True)
122+
if not lib.isscalar(mem):
123+
mem = mem.sum()
124+
return int(mem)
125+
126+
# no memory_usage attribute, so fall back to
127+
# object's 'sizeof'
128+
return super(self, PandasObject).__sizeof__()
129+
130+
115131
class NoNewAttributesMixin(object):
116132
"""Mixin which prevents adding new attributes.
117133

pandas/core/frame.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import numpy as np
2424
import numpy.ma as ma
2525

26-
from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _not_none,
26+
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
2727
_default_index, _maybe_upcast, is_sequence,
2828
_infer_dtype_from_scalar, _values_from_object,
2929
is_list_like, _maybe_box_datetimelike,
@@ -46,8 +46,7 @@
4646
from pandas.compat import(range, map, zip, lrange, lmap, lzip, StringIO, u,
4747
OrderedDict, raise_with_traceback)
4848
from pandas import compat
49-
from pandas.sparse.array import SparseArray
50-
from pandas.util.decorators import (cache_readonly, deprecate, Appender,
49+
from pandas.util.decorators import (deprecate, Appender,
5150
Substitution, deprecate_kwarg)
5251

5352
from pandas.tseries.period import PeriodIndex
@@ -1720,10 +1719,11 @@ def _sizeof_fmt(num, size_qualifier):
17201719
size_qualifier = '+'
17211720
mem_usage = self.memory_usage(index=True, deep=deep).sum()
17221721
lines.append("memory usage: %s\n" %
1723-
_sizeof_fmt(mem_usage, size_qualifier))
1722+
_sizeof_fmt(mem_usage, size_qualifier)
1723+
)
17241724
_put_lines(buf, lines)
17251725

1726-
def memory_usage(self, index=False, deep=False):
1726+
def memory_usage(self, index=True, deep=False):
17271727
"""Memory usage of DataFrame columns.
17281728
17291729
Parameters

pandas/core/series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2335,7 +2335,7 @@ def reindex_axis(self, labels, axis=0, **kwargs):
23352335
raise ValueError("cannot reindex series on non-zero axis!")
23362336
return self.reindex(index=labels, **kwargs)
23372337

2338-
def memory_usage(self, index=False, deep=False):
2338+
def memory_usage(self, index=True, deep=False):
23392339
"""Memory usage of the Series
23402340
23412341
Parameters

pandas/tests/test_base.py

+33-25
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
11
# -*- coding: utf-8 -*-
22
from __future__ import print_function
3+
34
import re
5+
import sys
46
from datetime import datetime, timedelta
7+
58
import numpy as np
6-
import pandas.compat as compat
9+
710
import pandas as pd
8-
from pandas.compat import u, StringIO
9-
from pandas.core.base import FrozenList, FrozenNDArray, PandasDelegate, NoNewAttributesMixin
11+
import pandas.compat as compat
1012
import pandas.core.common as com
13+
import pandas.util.testing as tm
14+
from pandas import (Series, Index, DatetimeIndex,
15+
TimedeltaIndex, PeriodIndex, Timedelta)
16+
from pandas.compat import u, StringIO
17+
from pandas.core.base import (FrozenList, FrozenNDArray,
18+
PandasDelegate, NoNewAttributesMixin)
1119
from pandas.tseries.base import DatetimeIndexOpsMixin
12-
from pandas.util.testing import assertRaisesRegexp, assertIsInstance
13-
from pandas.tseries.common import is_datetimelike
14-
from pandas import Series, Index, Int64Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta
15-
import pandas.tslib as tslib
16-
from pandas import _np_version_under1p9
17-
import nose
20+
from pandas.util.testing import (assertRaisesRegexp,
21+
assertIsInstance)
1822

19-
import pandas.util.testing as tm
2023

2124
class CheckStringMixin(object):
2225
def test_string_methods_dont_fail(self):
@@ -112,7 +115,9 @@ def setUp(self):
112115
def test_shallow_copying(self):
113116
original = self.container.copy()
114117
assertIsInstance(self.container.view(), FrozenNDArray)
115-
self.assertFalse(isinstance(self.container.view(np.ndarray), FrozenNDArray))
118+
self.assertFalse(isinstance(
119+
self.container.view(np.ndarray), FrozenNDArray
120+
))
116121
self.assertIsNot(self.container.view(), self.container)
117122
self.assert_numpy_array_equal(self.container, original)
118123
# shallow copy should be the same too
@@ -881,27 +886,30 @@ def get_fill_value(obj):
881886
# check shallow_copied
882887
self.assertFalse(o is result)
883888

884-
885889
def test_memory_usage(self):
886890
for o in self.objs:
887891
res = o.memory_usage()
888-
res2 = o.memory_usage(deep=True)
892+
res_deep = o.memory_usage(deep=True)
889893

890-
if com.is_object_dtype(o):
891-
self.assertTrue(res2 > res)
894+
if (com.is_object_dtype(o) or
895+
(isinstance(o, Series) and
896+
com.is_object_dtype(o.index))):
897+
# if there are objects, only deep will pick them up
898+
self.assertTrue(res_deep > res)
892899
else:
893-
self.assertEqual(res, res2)
900+
self.assertEqual(res, res_deep)
894901

895902
if isinstance(o, Series):
896-
res = o.memory_usage(index=True)
897-
res2 = o.memory_usage(index=True, deep=True)
898-
if com.is_object_dtype(o) or com.is_object_dtype(o.index):
899-
self.assertTrue(res2 > res)
900-
else:
901-
self.assertEqual(res, res2)
902-
903-
self.assertEqual(o.memory_usage(index=False) + o.index.memory_usage(),
904-
o.memory_usage(index=True))
903+
self.assertEqual(
904+
(o.memory_usage(index=False) +
905+
o.index.memory_usage()),
906+
o.memory_usage(index=True)
907+
)
908+
909+
# sys.getsizeof will call the .memory_usage with
910+
# deep=True, and add on some GC overhead
911+
diff = res_deep - sys.getsizeof(o)
912+
self.assertTrue(abs(diff) < 100)
905913

906914

907915
class TestFloat64HashTable(tm.TestCase):

pandas/tests/test_categorical.py

+18-11
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,22 @@
11
# -*- coding: utf-8 -*-
22
# pylint: disable=E1101,E1103,W0232
33

4-
from datetime import datetime
5-
from pandas.compat import range, lrange, u, PY3
64
import os
7-
import pickle
8-
import re
5+
import sys
6+
from datetime import datetime
97
from distutils.version import LooseVersion
108

119
import numpy as np
12-
import pandas as pd
1310

14-
from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp, CategoricalIndex
15-
16-
from pandas.core.config import option_context
17-
import pandas.core.common as com
11+
import pandas as pd
1812
import pandas.compat as compat
13+
import pandas.core.common as com
1914
import pandas.util.testing as tm
15+
from pandas import (Categorical, Index, Series, DataFrame,
16+
PeriodIndex, Timestamp, CategoricalIndex)
17+
from pandas.compat import range, lrange, u, PY3
18+
from pandas.core.config import option_context
19+
2020

2121
class TestCategorical(tm.TestCase):
2222
_multiprocess_can_split_ = True
@@ -1219,10 +1219,17 @@ def test_memory_usage(self):
12191219
self.assertEqual(cat.nbytes, cat.memory_usage())
12201220
self.assertTrue(cat.memory_usage(deep=True) > cat.nbytes)
12211221

1222+
# sys.getsizeof will call the .memory_usage with
1223+
# deep=True, and add on some GC overhead
1224+
diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
1225+
self.assertTrue(abs(diff) < 100)
1226+
12221227
def test_searchsorted(self):
12231228
# https://github.com/pydata/pandas/issues/8420
1224-
s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ])
1225-
s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ])
1229+
s1 = pd.Series(['apple', 'bread', 'bread', 'cheese',
1230+
'milk'])
1231+
s2 = pd.Series(['apple', 'bread', 'bread', 'cheese',
1232+
'milk', 'donuts'])
12261233
c1 = pd.Categorical(s1, ordered=True)
12271234
c2 = pd.Categorical(s2, ordered=True)
12281235

pandas/tests/test_frame.py

+22-10
Original file line numberDiff line numberDiff line change
@@ -7699,23 +7699,35 @@ def test_info_memory_usage(self):
76997699
df.columns = dtypes
77007700
# Ensure df size is as expected
77017701
df_size = df.memory_usage().sum()
7702-
exp_size = len(dtypes) * n * 8 # cols * rows * bytes
7702+
exp_size = (len(dtypes) + 1) * n * 8 # (cols + index) * rows * bytes
77037703
self.assertEqual(df_size, exp_size)
77047704
# Ensure number of cols in memory_usage is the same as df
7705-
size_df = np.size(df.columns.values) # index=False; default
7705+
size_df = np.size(df.columns.values) + 1 # index=True; default
77067706
self.assertEqual(size_df, np.size(df.memory_usage()))
77077707

77087708
# assert deep works only on object
7709-
self.assertEqual(df.memory_usage().sum(),df.memory_usage(deep=True).sum())
7709+
self.assertEqual(df.memory_usage().sum(),
7710+
df.memory_usage(deep=True).sum())
77107711

77117712
# test for validity
7712-
DataFrame(1,index=['a'],columns=['A']).memory_usage(index=True)
7713-
DataFrame(1,index=['a'],columns=['A']).index.nbytes
7714-
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes
7715-
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes
7716-
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).memory_usage(index=True)
7717-
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes
7718-
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes
7713+
DataFrame(1, index=['a'], columns=['A']
7714+
).memory_usage(index=True)
7715+
DataFrame(1, index=['a'], columns=['A']
7716+
).index.nbytes
7717+
df = DataFrame(
7718+
data=1,
7719+
index=pd.MultiIndex.from_product(
7720+
[['a'], range(1000)]),
7721+
columns=['A']
7722+
)
7723+
df.index.nbytes
7724+
df.memory_usage(index=True)
7725+
df.index.values.nbytes
7726+
7727+
# sys.getsizeof will call the .memory_usage with
7728+
# deep=True, and add on some GC overhead
7729+
diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
7730+
self.assertTrue(abs(diff) < 100)
77197731

77207732
def test_dtypes(self):
77217733
self.mixed_frame['bool'] = self.mixed_frame['A'] > 0

0 commit comments

Comments
 (0)