diff --git a/doc/source/faq.rst b/doc/source/faq.rst index a613d53218ce2..259243638ac74 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -24,6 +24,81 @@ Frequently Asked Questions (FAQ) options.display.mpl_style='default' from pandas.compat import lrange + +.. _df-memory-usage: + +DataFrame memory usage +~~~~~~~~~~~~~~~~~~~~~~ +As of pandas version 0.15.0, the memory usage of a dataframe (including +the index) is shown when accessing the ``info`` method of a dataframe. A +configuration option, ``display.memory_usage`` (see :ref:`options`), +specifies if the dataframe's memory usage will be displayed when +invoking the df.info() method. + +For example, the memory usage of the dataframe below is shown +when calling df.info(): + +.. ipython:: python + + dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', + 'complex128', 'object', 'bool'] + n = 5000 + data = dict([ (t, np.random.randint(100, size=n).astype(t)) + for t in dtypes]) + df = DataFrame(data) + + df.info() + +By default the display option is set to True but can be explicitly +overridden by passing the memory_usage argument when invoking df.info(). +Note that ``memory_usage=None`` is the default value for the df.info() +method and follows the setting specified by display.memory_usage. + +The memory usage of each column can be found by calling the ``memory_usage`` +method. This returns a Series with an index represented by column names +and memory usage of each column shown in bytes. For the dataframe above, +the memory usage of each column and the total memory usage of the +dataframe can be found with the memory_usage method: + +.. ipython:: python + + df.memory_usage() + + # total memory usage of dataframe + df.memory_usage().sum() + +By default the memory usage of the dataframe's index is not shown in the +returned Series, the memory usage of the index can be shown by passing +the ``index=True`` argument: + +.. 
ipython:: python + + df.memory_usage(index=True) + +The memory usage displayed by the ``info`` method utilizes the +``memory_usage`` method to determine the memory usage of a dataframe +while also formatting the output in human-readable units (base-2 +representation; i.e., 1KB = 1024 bytes). + +Pandas version 0.15.0 introduces a new categorical data type (see +:ref:`categorical`), which can be used in Series and DataFrames. +Significant memory savings can be achieved when using the category +datatype. This is demonstrated below: + +.. ipython:: python + + df['bases_object'] = Series(np.array(['adenine', 'cytosine', 'guanine', 'thymine']).take(np.random.randint(0,4,size=len(df)))) + + df['bases_categorical'] = df['bases_object'].astype('category') + + df.memory_usage() + +While the *bases_object* and *bases_categorical* appear as identical +columns in the dataframe, the memory savings of the categorical +datatype, versus the object datatype, are revealed by ``memory_usage``. + + + .. _ref-monkey-patching: Adding Features to your pandas Installation diff --git a/doc/source/options.rst b/doc/source/options.rst index 95a137fb96e66..5edd28e559bc1 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -348,6 +348,9 @@ display.max_seq_items 100 when pretty-printing a long sequence, of "..." to the resulting string. If set to None, the number of items to be printed is unlimited. +display.memory_usage True This specifies if the memory usage of + a DataFrame should be displayed when the + df.info() method is invoked. 
display.mpl_style None Setting this to 'default' will modify the rcParams used by matplotlib to give plots a more pleasing visual diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 1d9acadb68e58..78e511cecba6e 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -259,6 +259,16 @@ API changes - ``DataFrame.plot`` and ``Series.plot`` keywords are now have consistent orders (:issue:`8037`) +- Implements methods to find memory usage of a DataFrame (:issue:`6852`). A new display option ``display.memory_usage`` (see :ref:`options`) sets the default behavior of the ``memory_usage`` argument in the ``df.info()`` method; by default ``display.memory_usage`` is ``True`` but this can be overridden by explicitly passing the ``memory_usage`` argument to the ``df.info()`` method, as shown below. Additionally, ``memory_usage`` is an available method for a dataframe object which returns the memory usage of each column (for more information see :ref:`df-memory-usage`): + + .. ipython:: python + + df = DataFrame({ 'float' : np.random.randn(1000), 'int' : np.random.randint(0,5,size=1000)}) + df.memory_usage() + + df.info(memory_usage=True) + + .. _whatsnew_0150.dt: .dt accessor diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 1d93b9d5e69c1..a56d3b93d87da 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -203,6 +203,12 @@ Setting this to None/False restores the values to their initial value. """ +pc_memory_usage_doc = """ +: bool or None + This specifies if the memory usage of a DataFrame should be displayed when + df.info() is called. 
+""" + style_backup = dict() @@ -274,6 +280,8 @@ def mpl_style_cb(key): # redirected to width, make defval identical cf.register_option('line_width', get_default_val('display.width'), pc_line_width_doc) + cf.register_option('memory_usage', True, pc_memory_usage_doc, + validator=is_instance_factory([type(None), bool])) cf.deprecate_option('display.line_width', msg=pc_line_width_deprecation_warning, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5cfb2affe5a7b..65f7d56f5aa8a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1390,7 +1390,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, if buf is None: return formatter.buf.getvalue() - def info(self, verbose=None, buf=None, max_cols=None): + def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None): """ Concise summary of a DataFrame. @@ -1404,6 +1404,12 @@ def info(self, verbose=None, buf=None, max_cols=None): max_cols : int, default None Determines whether full summary or short summary is printed. None follows the `display.max_info_columns` setting. + memory_usage : boolean, default None + Specifies whether total memory usage of the DataFrame + elements (including index) should be displayed. None follows + the `display.memory_usage` setting. True or False overrides + the `display.memory_usage` setting. Memory usage is shown in + human-readable units (base-2 representation). 
def memory_usage(self, index=False):
    """Memory usage of DataFrame columns.

    Parameters
    ----------
    index : bool, default False
        Specifies whether to include memory usage of DataFrame's
        index in returned Series. If `index=True`, the index usage is
        the first entry of the Series and is labelled `Index`.

    Returns
    -------
    sizes : Series
        A series with column names as index and memory usage of
        columns with units of bytes.

    Notes
    -----
    Each size is the ``nbytes`` of the column's underlying values, so
    memory usage does not include memory consumed by elements that
    are not components of the array.

    See Also
    --------
    numpy.ndarray.nbytes
    """
    # Columns are read by position rather than by label so that frames
    # with duplicate column names still yield exactly one entry per column.
    sizes = [self.iloc[:, pos].values.nbytes
             for pos in range(len(self.columns))]
    if not index:
        return Series(sizes, index=self.columns)
    # Build the result in one go instead of creating a one-element Series
    # and concatenating it in front of the per-column sizes.
    labels = ['Index'] + list(self.columns)
    return Series([self.index.values.nbytes] + sizes, index=labels)
def test_info_memory_usage(self):
    """Check the ``memory_usage`` switch of ``DataFrame.info`` and the
    per-column sizes reported by ``DataFrame.memory_usage``."""
    # Ensure memory usage is displayed, when asserted, on the last line
    dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
              'complex128', 'object', 'bool']
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)

    # display memory usage case
    buf = StringIO()
    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    self.assertTrue("memory usage: " in res[-1])

    # do not display memory usage case; a fresh buffer is used so the
    # output of the previous call cannot leak into this check, which lets
    # the whole output (not just its last line) be inspected
    buf = StringIO()
    df.info(buf=buf, memory_usage=False)
    self.assertTrue("memory usage: " not in buf.getvalue())

    # Test a DataFrame with duplicate columns
    dtypes = ['int64', 'int64', 'int64', 'float64']
    data = {}
    n = 100
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    df.columns = dtypes
    # Ensure df size is as expected; the index is excluded explicitly so
    # the expectation covers column data only
    df_size = df.memory_usage(index=False).sum()
    exp_size = len(dtypes) * n * 8  # cols * rows * bytes
    self.assertEqual(df_size, exp_size)
    # Ensure number of cols in memory_usage is the same as df
    size_df = np.size(df.columns.values)
    self.assertEqual(size_df, np.size(df.memory_usage(index=False)))