Merge pull request #7619 from asobrien/df-mem-info

jreback · jreback · commit d1c02710f5e7 · 2014-10-04T22:31:10.000-04:00
ENH: dataframe memory usage
diff --git a/doc/source/faq.rst b/doc/source/faq.rst
@@ -24,6 +24,81 @@ Frequently Asked Questions (FAQ)
    options.display.mpl_style='default'
    from pandas.compat import lrange
 
+
+.. _df-memory-usage:
+
+DataFrame memory usage
+~~~~~~~~~~~~~~~~~~~~~~
+As of pandas version 0.15.0, the memory usage of a dataframe (including
+the index) is shown when accessing the ``info`` method of a dataframe. A
+configuration option, ``display.memory_usage`` (see :ref:`options`),
+specifies if the dataframe's memory usage will be displayed when
+invoking the df.info() method.
+
+For example, the memory usage of the dataframe below is shown
+when calling df.info():
+
+.. ipython:: python
+
+    dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
+                'complex128', 'object', 'bool']
+    n = 5000
+    data = dict([ (t, np.random.randint(100, size=n).astype(t))
+                    for t in dtypes])
+    df = DataFrame(data)
+
+    df.info()
+
+By default the display option is set to True, but it can be explicitly
+overridden by passing the ``memory_usage`` argument to ``df.info()``.
+Note that ``memory_usage=None`` is the default value for the ``df.info()``
+method and follows the setting specified by ``display.memory_usage``.
+
+The memory usage of each column can be found by calling the ``memory_usage``
+method. This returns a Series with an index represented by column names
+and memory usage of each column shown in bytes. For the dataframe above,
+the memory usage of each column and the total memory usage of the
+dataframe can be found with the memory_usage method:
+
+.. ipython:: python
+
+    df.memory_usage()
+
+    # total memory usage of dataframe
+    df.memory_usage().sum()
+
+By default the memory usage of the dataframe's index is not included in
+the returned Series; it can be included by passing the ``index=True``
+argument:
+
+.. ipython:: python
+
+    df.memory_usage(index=True)
+
+The memory usage displayed by the ``info`` method utilizes the
+``memory_usage`` method to determine the memory usage of a dataframe
+while also formatting the output in human-readable units (base-2
+representation; i.e., 1KB = 1024 bytes).
+
+Pandas version 0.15.0 introduces a new categorical data type (see
+:ref:`categorical`), which can be used in Series and DataFrames.
+Significant memory savings can be achieved when using the category
+datatype. This is demonstrated below:
+
+.. ipython:: python
+
+  df['bases_object'] = Series(np.array(['adenine', 'cytosine', 'guanine', 'thymine']).take(np.random.randint(0,4,size=len(df))))
+
+  df['bases_categorical'] = df['bases_object'].astype('category')
+
+  df.memory_usage()
+
+While the *bases_object* and *bases_categorical* columns appear
+identical in the dataframe, the memory savings of the categorical
+datatype, versus the object datatype, are revealed by ``memory_usage``.
+
+
+
 .. _ref-monkey-patching:
 
 Adding Features to your pandas Installation
diff --git a/doc/source/options.rst b/doc/source/options.rst
@@ -348,6 +348,9 @@ display.max_seq_items      100          when pretty-printing a long sequence,
                                         of "..." to the resulting string.
                                         If set to None, the number of items
                                         to be printed is unlimited.
+display.memory_usage       True         This specifies if the memory usage of
+                                        a DataFrame should be displayed when the
+                                        df.info() method is invoked.
 display.mpl_style          None         Setting this to 'default' will modify
                                         the rcParams used by matplotlib
                                         to give plots a more pleasing visual
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -259,6 +259,16 @@ API changes
 
 - ``DataFrame.plot`` and ``Series.plot`` keywords are now have consistent orders (:issue:`8037`)
 
+- Implements methods to find memory usage of a DataFrame (:issue:`6852`). A new display option ``display.memory_usage`` (see :ref:`options`) sets the default behavior of the ``memory_usage`` argument in the ``df.info()`` method; by default ``display.memory_usage`` is True but this can be overridden by explicitly passing the ``memory_usage`` argument to the ``df.info()`` method, as shown below. Additionally, ``memory_usage`` is an available method for a dataframe object which returns the memory usage of each column (for more information see :ref:`df-memory-usage`):
+
+  .. ipython:: python
+
+     df = DataFrame({ 'float' : np.random.randn(1000), 'int' : np.random.randint(0,5,size=1000)})
+     df.memory_usage()
+
+     df.info(memory_usage=True)
+
+
 .. _whatsnew_0150.dt:
 
 .dt accessor
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -203,6 +203,12 @@
     Setting this to None/False restores the values to their initial value.
 """
 
+pc_memory_usage_doc = """
+: bool or None
+    This specifies if the memory usage of a DataFrame should be displayed when
+    df.info() is called.
+"""
+
 style_backup = dict()
 
 
@@ -274,6 +280,8 @@ def mpl_style_cb(key):
     # redirected to width, make defval identical
     cf.register_option('line_width', get_default_val('display.width'),
                        pc_line_width_doc)
+    cf.register_option('memory_usage', True, pc_memory_usage_doc,
+                        validator=is_instance_factory([type(None), bool]))
 
 cf.deprecate_option('display.line_width',
                     msg=pc_line_width_deprecation_warning,
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1390,7 +1390,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None,
         if buf is None:
             return formatter.buf.getvalue()
 
-    def info(self, verbose=None, buf=None, max_cols=None):
+    def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None):
         """
         Concise summary of a DataFrame.
 
@@ -1404,6 +1404,12 @@ def info(self, verbose=None, buf=None, max_cols=None):
         max_cols : int, default None
             Determines whether full summary or short summary is printed.
             None follows the `display.max_info_columns` setting.
+        memory_usage : boolean, default None
+            Specifies whether total memory usage of the DataFrame
+            elements (including index) should be displayed. None follows
+            the `display.memory_usage` setting. True or False overrides
+            the `display.memory_usage` setting. Memory usage is shown in
+            human-readable units (base-2 representation).
         """
         from pandas.core.format import _put_lines
 
@@ -1462,6 +1468,14 @@ def _verbose_repr():
         def _non_verbose_repr():
             lines.append(self.columns.summary(name='Columns'))
 
+        def _sizeof_fmt(num):
+            # returns size in human readable format
+            for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
+                if num < 1024.0:
+                    return "%3.1f %s" % (num, x)
+                num /= 1024.0
+            return "%3.1f %s" % (num, 'PB')
+
         if verbose:
             _verbose_repr()
         elif verbose is False:  # specifically set to False, not nesc None
@@ -1474,9 +1488,46 @@ def _non_verbose_repr():
 
         counts = self.get_dtype_counts()
         dtypes = ['%s(%d)' % k for k in sorted(compat.iteritems(counts))]
-        lines.append('dtypes: %s\n' % ', '.join(dtypes))
+        lines.append('dtypes: %s' % ', '.join(dtypes))
+        if memory_usage is None:
+            memory_usage = get_option('display.memory_usage')
+        if memory_usage:  # append memory usage of df to display
+            lines.append("memory usage: %s\n" %
+                            _sizeof_fmt(self.memory_usage(index=True).sum()))
         _put_lines(buf, lines)
 
+    def memory_usage(self, index=False):
+        """Memory usage of DataFrame columns.
+
+        Parameters
+        ----------
+        index : bool
+            Specifies whether to include memory usage of DataFrame's
+            index in returned Series. If `index=True` (default is False)
+            the first index of the Series is `Index`.
+
+        Returns
+        -------
+        sizes : Series
+            A series with column names as index and memory usage of
+            columns with units of bytes.
+
+        Notes
+        -----
+        Memory usage does not include memory consumed by elements that
+        are not components of the array.
+
+        See Also
+        --------
+        numpy.ndarray.nbytes
+        """
+        result = Series([ c.values.nbytes for col, c in self.iteritems() ],
+                        index=self.columns)
+        if index:
+             result = Series(self.index.values.nbytes,
+                        index=['Index']).append(result)
+        return result
+
     def transpose(self):
         """Transpose index and columns"""
         return super(DataFrame, self).transpose(1, 0)
diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py
@@ -43,7 +43,7 @@ def has_info_repr(df):
 def has_non_verbose_info_repr(df):
     has_info = has_info_repr(df)
     r = repr(df)
-    nv = len(r.split('\n')) == 5  # 1. <class>, 2. Index, 3. Columns, 4. dtype, 5. trailing newline
+    nv = len(r.split('\n')) == 6  # 1. <class>, 2. Index, 3. Columns, 4. dtype, 5. memory usage, 6. trailing newline
     return has_info and nv
 
 def has_horizontally_truncated_repr(df):
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -6553,7 +6553,7 @@ def test_info_max_cols(self):
                 buf = StringIO()
                 df.info(buf=buf, verbose=verbose)
                 res = buf.getvalue()
-                self.assertEqual(len(res.split('\n')), len_)
+                self.assertEqual(len(res.strip().split('\n')), len_)
 
         for len_, verbose in [(10, None), (5, False), (10, True)]:
 
@@ -6562,23 +6562,57 @@ def test_info_max_cols(self):
                 buf = StringIO()
                 df.info(buf=buf, verbose=verbose)
                 res = buf.getvalue()
-                self.assertEqual(len(res.split('\n')), len_)
+                self.assertEqual(len(res.strip().split('\n')), len_)
 
         for len_, max_cols in [(10, 5), (5, 4)]:
             # setting truncates
             with option_context('max_info_columns', 4):
                 buf = StringIO()
                 df.info(buf=buf, max_cols=max_cols)
                 res = buf.getvalue()
-                self.assertEqual(len(res.split('\n')), len_)
+                self.assertEqual(len(res.strip().split('\n')), len_)
 
             # setting wouldn't truncate
             with option_context('max_info_columns', 5):
                 buf = StringIO()
                 df.info(buf=buf, max_cols=max_cols)
                 res = buf.getvalue()
-                self.assertEqual(len(res.split('\n')), len_)
+                self.assertEqual(len(res.strip().split('\n')), len_)
 
+    def test_info_memory_usage(self):
+        # Ensure memory usage is displayed, when asserted, on the last line
+        dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
+                  'complex128', 'object', 'bool']
+        data = {}
+        n = 10
+        for i, dtype in enumerate(dtypes):
+            data[i] = np.random.randint(2, size=n).astype(dtype)
+        df = DataFrame(data)
+        buf = StringIO()
+        # display memory usage case
+        df.info(buf=buf, memory_usage=True)
+        res = buf.getvalue().splitlines()
+        self.assertTrue("memory usage: " in res[-1])
+        # do not display memory usage case
+        df.info(buf=buf, memory_usage=False)
+        res = buf.getvalue().splitlines()
+        self.assertTrue("memory usage: " not in res[-1])
+
+        # Test a DataFrame with duplicate columns
+        dtypes = ['int64', 'int64', 'int64', 'float64']
+        data = {}
+        n = 100
+        for i, dtype in enumerate(dtypes):
+            data[i] = np.random.randint(2, size=n).astype(dtype)
+        df = DataFrame(data)
+        df.columns = dtypes
+        # Ensure df size is as expected
+        df_size = df.memory_usage().sum()
+        exp_size = len(dtypes) * n * 8  # cols * rows * bytes
+        self.assertEqual(df_size, exp_size)
+        # Ensure number of cols in memory_usage is the same as df
+        size_df = np.size(df.columns.values)  # index=False; default
+        self.assertEqual(size_df, np.size(df.memory_usage()))
 
     def test_dtypes(self):
         self.mixed_frame['bool'] = self.mixed_frame['A'] > 0