From df2418774861111238bdca1edfb8908fe9db3da9 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 22 Oct 2014 00:30:54 -0700 Subject: [PATCH] Use '+' to qualify memory usage in df.info() if appropriate This should resolve @kay1793's complaint in #8578. --- doc/source/faq.rst | 4 ++++ doc/source/v0.15.1.txt | 2 ++ pandas/core/frame.py | 14 ++++++++++---- pandas/tests/test_frame.py | 15 +++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/doc/source/faq.rst b/doc/source/faq.rst index 2befb22ab5de4..b93e5ae9c922a 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -50,6 +50,10 @@ when calling ``df.info()``: df.info() +The ``+`` symbol indicates that the true memory usage could be higher, because +pandas does not count the memory used by values in columns with +``dtype=object``. + By default the display option is set to ``True`` but can be explicitly overridden by passing the ``memory_usage`` argument when invoking ``df.info()``. diff --git a/doc/source/v0.15.1.txt b/doc/source/v0.15.1.txt index 2bd88531f92d9..fa1b8b24e75b5 100644 --- a/doc/source/v0.15.1.txt +++ b/doc/source/v0.15.1.txt @@ -28,6 +28,8 @@ Enhancements - Added option to select columns when importing Stata files (:issue:`7935`) +- Qualify memory usage in ``DataFrame.info()`` by adding ``+`` if it is a lower bound (:issue:`8578`) + .. _whatsnew_0151.performance: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2c172d6fe1af0..d90ef76ddfa5e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1478,13 +1478,13 @@ def _verbose_repr(): def _non_verbose_repr(): lines.append(self.columns.summary(name='Columns')) - def _sizeof_fmt(num): + def _sizeof_fmt(num, size_qualifier): # returns size in human readable format for x in ['bytes', 'KB', 'MB', 'GB', 'TB']: if num < 1024.0: - return "%3.1f %s" % (num, x) + return "%3.1f%s %s" % (num, size_qualifier, x) num /= 1024.0 - return "%3.1f %s" % (num, 'PB') + return "%3.1f%s %s" % (num, size_qualifier, 'PB') if verbose: _verbose_repr() @@ -1502,8 +1502,14 @@ def _sizeof_fmt(num): if memory_usage is None: memory_usage = get_option('display.memory_usage') if memory_usage: # append memory usage of df to display + # size_qualifier is just a best effort; not guaranteed to catch all + # cases (e.g., it misses categorical data even with object + # categories) + size_qualifier = ('+' if 'object' in counts + or self.index.dtype.kind == 'O' else '') + mem_usage = self.memory_usage(index=True).sum() lines.append("memory usage: %s\n" % - _sizeof_fmt(self.memory_usage(index=True).sum())) + _sizeof_fmt(mem_usage, size_qualifier)) _put_lines(buf, lines) def memory_usage(self, index=False): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index e5064544b292e..3f4d825a4b82e 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -6732,6 +6732,21 @@ def test_info_memory_usage(self): res = buf.getvalue().splitlines() self.assertTrue("memory usage: " not in res[-1]) + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + # memory usage is a lower bound, so print it as XYZ+ MB + self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1])) + + df.iloc[:, :5].info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + # excluded column with object dtype, so estimate is accurate + self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1])) + + df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) + df_with_object_index.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1])) + # Test a DataFrame with duplicate columns dtypes = ['int64', 'int64', 'int64', 'float64'] data = {}