Skip to content

Commit 8747633

Browse files
committed
Merge pull request #8599 from shoyer/qualify-mem-usage
Use '+' to qualify memory usage in df.info() if appropriate
2 parents 3c1185f + df24187 commit 8747633

File tree

4 files changed

+31
-4
lines changed

4 files changed

+31
-4
lines changed

doc/source/faq.rst

+4
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,10 @@ when calling ``df.info()``:
5050
5151
df.info()
5252
53+
The ``+`` symbol indicates that the true memory usage could be higher, because
54+
pandas does not count the memory used by values in columns with
55+
``dtype=object``.
56+
5357
By default the display option is set to ``True`` but can be explicitly
5458
overridden by passing the ``memory_usage`` argument when invoking ``df.info()``.
5559

doc/source/v0.15.1.txt

+2
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,8 @@ Enhancements
2828

2929
- Added option to select columns when importing Stata files (:issue:`7935`)
3030

31+
- Qualify memory usage in ``DataFrame.info()`` by adding ``+`` if it is a lower bound (:issue:`8578`)
32+
3133

3234
.. _whatsnew_0151.performance:
3335

pandas/core/frame.py

+10 -4
Original file line number | Diff line number | Diff line change
@@ -1478,13 +1478,13 @@ def _verbose_repr():
14781478
def _non_verbose_repr():
14791479
lines.append(self.columns.summary(name='Columns'))
14801480

1481-
def _sizeof_fmt(num):
1481+
def _sizeof_fmt(num, size_qualifier):
14821482
# returns size in human readable format
14831483
for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
14841484
if num < 1024.0:
1485-
return "%3.1f %s" % (num, x)
1485+
return "%3.1f%s %s" % (num, size_qualifier, x)
14861486
num /= 1024.0
1487-
return "%3.1f %s" % (num, 'PB')
1487+
return "%3.1f%s %s" % (num, size_qualifier, 'PB')
14881488

14891489
if verbose:
14901490
_verbose_repr()
@@ -1502,8 +1502,14 @@ def _sizeof_fmt(num):
15021502
if memory_usage is None:
15031503
memory_usage = get_option('display.memory_usage')
15041504
if memory_usage: # append memory usage of df to display
1505+
# size_qualifier is just a best effort; not guaranteed to catch all
1506+
# cases (e.g., it misses categorical data even with object
1507+
# categories)
1508+
size_qualifier = ('+' if 'object' in counts
1509+
or self.index.dtype.kind == 'O' else '')
1510+
mem_usage = self.memory_usage(index=True).sum()
15051511
lines.append("memory usage: %s\n" %
1506-
_sizeof_fmt(self.memory_usage(index=True).sum()))
1512+
_sizeof_fmt(mem_usage, size_qualifier))
15071513
_put_lines(buf, lines)
15081514

15091515
def memory_usage(self, index=False):

pandas/tests/test_frame.py

+15
Original file line number | Diff line number | Diff line change
@@ -6732,6 +6732,21 @@ def test_info_memory_usage(self):
67326732
res = buf.getvalue().splitlines()
67336733
self.assertTrue("memory usage: " not in res[-1])
67346734

6735+
df.info(buf=buf, memory_usage=True)
6736+
res = buf.getvalue().splitlines()
6737+
# memory usage is a lower bound, so print it as XYZ+ MB
6738+
self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))
6739+
6740+
df.iloc[:, :5].info(buf=buf, memory_usage=True)
6741+
res = buf.getvalue().splitlines()
6742+
# excluded column with object dtype, so estimate is accurate
6743+
self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1]))
6744+
6745+
df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
6746+
df_with_object_index.info(buf=buf, memory_usage=True)
6747+
res = buf.getvalue().splitlines()
6748+
self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))
6749+
67356750
# Test a DataFrame with duplicate columns
67366751
dtypes = ['int64', 'int64', 'int64', 'float64']
67376752
data = {}

0 commit comments

Comments
 (0)