Skip to content

Use '+' to qualify memory usage in df.info() if appropriate #8599

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 22, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/source/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ when calling ``df.info()``:

df.info()

The ``+`` symbol indicates that the true memory usage could be higher, because
pandas does not count the memory used by values in columns (or in an index)
with ``dtype=object``.

By default the ``display.memory_usage`` option is set to ``True``, but it can
be explicitly overridden by passing the ``memory_usage`` argument when invoking
``df.info()``.

Expand Down
2 changes: 2 additions & 0 deletions doc/source/v0.15.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ Enhancements

- Added option to select columns when importing Stata files (:issue:`7935`)

- Qualify memory usage in ``DataFrame.info()`` by adding ``+`` if it is a lower bound (:issue:`8578`)


.. _whatsnew_0151.performance:

Expand Down
14 changes: 10 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1478,13 +1478,13 @@ def _verbose_repr():
def _non_verbose_repr():
lines.append(self.columns.summary(name='Columns'))

def _sizeof_fmt(num):
def _sizeof_fmt(num, size_qualifier):
# returns size in human readable format
for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
if num < 1024.0:
return "%3.1f %s" % (num, x)
return "%3.1f%s %s" % (num, size_qualifier, x)
num /= 1024.0
return "%3.1f %s" % (num, 'PB')
return "%3.1f%s %s" % (num, size_qualifier, 'PB')

if verbose:
_verbose_repr()
Expand All @@ -1502,8 +1502,14 @@ def _sizeof_fmt(num):
if memory_usage is None:
memory_usage = get_option('display.memory_usage')
if memory_usage: # append memory usage of df to display
# size_qualifier is just a best effort; not guaranteed to catch all
# cases (e.g., it misses categorical data even with object
# categories)
size_qualifier = ('+' if 'object' in counts
or self.index.dtype.kind == 'O' else '')
mem_usage = self.memory_usage(index=True).sum()
lines.append("memory usage: %s\n" %
_sizeof_fmt(self.memory_usage(index=True).sum()))
_sizeof_fmt(mem_usage, size_qualifier))
_put_lines(buf, lines)

def memory_usage(self, index=False):
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6732,6 +6732,21 @@ def test_info_memory_usage(self):
res = buf.getvalue().splitlines()
self.assertTrue("memory usage: " not in res[-1])

df.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
# memory usage is a lower bound, so print it as XYZ+ MB
self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

df.iloc[:, :5].info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
# excluded column with object dtype, so estimate is accurate
self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1]))

df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
df_with_object_index.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

# Test a DataFrame with duplicate columns
dtypes = ['int64', 'int64', 'int64', 'float64']
data = {}
Expand Down