From 2b1e5fc444bb3b7ed417dad4d084d28ce3ec5d37 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 7 Feb 2020 16:42:40 +0000 Subject: [PATCH 01/59] first draft --- pandas/core/generic.py | 298 ++++++++++++++++++++++++++++++- pandas/tests/series/test_repr.py | 20 +++ 2 files changed, 317 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 579daae2b15c6..a266f0102f539 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6,6 +6,7 @@ import operator import pickle import re +import sys from textwrap import dedent from typing import ( TYPE_CHECKING, @@ -28,7 +29,7 @@ import numpy as np -from pandas._config import config +from pandas._config import config, get_option from pandas._libs import Timestamp, iNaT, lib from pandas._typing import ( @@ -125,6 +126,10 @@ ) +def _put_str(s, space): + return str(s)[:space].ljust(space) + + def _single_replace(self, to_replace, method, inplace, limit): """ Replaces values in a Series using the fill method specified when no @@ -1716,6 +1721,297 @@ def keys(self): """ return self._info_axis + def info( + self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None + ) -> None: + """ + Print a concise summary of a DataFrame. + + This method prints information about a DataFrame including + the index dtype and column dtypes, non-null values and memory usage. + + Parameters + ---------- + verbose : bool, optional + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. + buf : writable buffer, defaults to sys.stdout + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + max_cols : int, optional + When to switch from the verbose to the truncated output. If the + DataFrame has more than `max_cols` columns, the truncated output + is used. By default, the setting in + ``pandas.options.display.max_info_columns`` is used. + memory_usage : bool, str, optional + Specifies whether total memory usage of the DataFrame + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. + + True always show memory usage. False never shows memory usage. + A value of 'deep' is equivalent to "True with deep introspection". + Memory usage is shown in human-readable units (base-2 + representation). Without deep introspection a memory estimation is + made based in column dtype and number of rows assuming values + consume the same memory amount for corresponding dtypes. With deep + memory introspection, a real memory usage calculation is performed + at the cost of computational resources. + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the frame is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. + + Returns + ------- + None + This method prints a summary of a DataFrame and returns None. + + See Also + -------- + DataFrame.describe: Generate descriptive statistics of DataFrame + columns. + DataFrame.memory_usage: Memory usage of DataFrame columns. + + Examples + -------- + >>> int_values = [1, 2, 3, 4, 5] + >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] + >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] + >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, + ... "float_col": float_values}) + >>> df + int_col text_col float_col + 0 1 alpha 0.00 + 1 2 beta 0.25 + 2 3 gamma 0.50 + 3 4 delta 0.75 + 4 5 epsilon 1.00 + + Prints information of all columns: + + >>> df.info(verbose=True) + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 248.0+ bytes + + Prints a summary of columns count and its dtypes but not per column + information: + + >>> df.info(verbose=False) + + RangeIndex: 5 entries, 0 to 4 + Columns: 3 entries, int_col to float_col + dtypes: float64(1), int64(1), object(1) + memory usage: 248.0+ bytes + + Pipe output of DataFrame.info to buffer instead of sys.stdout, get + buffer content and writes to a text file: + + >>> import io + >>> buffer = io.StringIO() + >>> df.info(buf=buffer) + >>> s = buffer.getvalue() + >>> with open("df_info.txt", "w", + ... encoding="utf-8") as f: # doctest: +SKIP + ... f.write(s) + 260 + + The `memory_usage` parameter allows deep introspection mode, specially + useful for big DataFrames and fine-tune memory optimization: + + >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> df = pd.DataFrame({ + ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) + ... }) + >>> df.info() + + RangeIndex: 1000000 entries, 0 to 999999 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object + dtypes: object(3) + memory usage: 22.9+ MB + + >>> df.info(memory_usage='deep') + + RangeIndex: 1000000 entries, 0 to 999999 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object + dtypes: object(3) + memory usage: 188.8 MB + """ + if buf is None: # pragma: no cover + buf = sys.stdout + + lines = [] + + lines.append(str(type(self))) + lines.append(self.index._summary()) + + if self._typ == "dataframe": + cols = self.columns + dtypes = self.dtypes + else: + cols = pd.Series([self.name]) + dtypes = pd.Series([self.dtypes]) + + col_count = len(cols) + + if col_count == 0: + lines.append(f"Empty {type(self).__name__}") + fmt.buffer_put_lines(buf, lines) + return + + # hack + if max_cols is None: + max_cols = get_option("display.max_info_columns", col_count + 1) + + max_rows = get_option("display.max_info_rows", len(self) + 1) + + if null_counts is None: + show_counts = (col_count <= max_cols) and (len(self) < max_rows) + else: + show_counts = null_counts + exceeds_info_cols = col_count > max_cols + + def _verbose_repr(): + if self._typ == "dataframe": + lines.append(f"Data columns (total {col_count} columns):") + counts = self.count() + else: + lines.append(f"Series name: {self.name}") + counts = pd.Series([self.count()]) + + id_head = " # " + column_head = "Column" + col_space = 2 + + max_col = max(len(pprint_thing(k)) for k in cols) + len_column = len(pprint_thing(column_head)) + space = max(max_col, len_column) + col_space + + max_id = len(pprint_thing(col_count)) + len_id = len(pprint_thing(id_head)) + space_num = max(max_id, len_id) + col_space + + header = _put_str(id_head, space_num) + if self._typ == "dataframe": + header += _put_str(column_head, space) + if show_counts: + if len(cols) != len(counts): # pragma: no cover + raise AssertionError( + f"Columns must equal counts ({col_count} != {len(counts)})" + ) + count_header = "Non-Null Count" + len_count = len(count_header) + non_null = " non-null" + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + space_count = max(len_count, max_count) + col_space + count_temp = "{count}" + non_null + else: + counts = None # can this be remove? + count_header = "" + space_count = len(count_header) + len_count = space_count + count_temp = "{count}" + + dtype_header = "Dtype" + len_dtype = len(dtype_header) + max_dtypes = max(len(pprint_thing(k)) for k in dtypes) + space_dtype = max(len_dtype, max_dtypes) + header += _put_str(count_header, space_count) + _put_str( + dtype_header, space_dtype + ) + + lines.append(header) + lines.append( + _put_str("-" * len_id, space_num) + + _put_str("-" * len_column, space) * (self._typ == "dataframe") + + _put_str("-" * len_count, space_count) + + _put_str("-" * len_dtype, space_dtype) + ) + + for i, col in enumerate(cols): + dtype = dtypes.iloc[i] + col = pprint_thing(col) + + line_no = _put_str(f" {i}", space_num) + count = "" + if show_counts: + count = counts.iloc[i] + + lines.append( + line_no + + _put_str(col, space) * (self._typ == "dataframe") + + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtype, space_dtype) + ) + + def _non_verbose_repr(): + lines.append(cols._summary(name="Columns")) + + def _sizeof_fmt(num, size_qualifier): + # returns size in human readable format + for x in ["bytes", "KB", "MB", "GB", "TB"]: + if num < 1024.0: + return f"{num:3.1f}{size_qualifier} {x}" + num /= 1024.0 + return f"{num:3.1f}{size_qualifier} PB" + + if verbose: + _verbose_repr() + elif verbose is False: # specifically set to False, not nesc None + _non_verbose_repr() + else: + if exceeds_info_cols: + _non_verbose_repr() + else: + _verbose_repr() + + counts = self._data.get_dtype_counts() + dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] + lines.append(f"dtypes: {', '.join(dtypes)}") + + if memory_usage is None: + memory_usage = get_option("display.memory_usage") + if memory_usage: + # append memory usage of df to display + size_qualifier = "" + if memory_usage == "deep": + deep = True + else: + # size_qualifier is just a best effort; not guaranteed to catch + # all cases (e.g., it misses categorical data even with object + # categories) + deep = False + if "object" in counts or self.index._is_memory_usage_qualified(): + size_qualifier = "+" + if self._typ == "dataframe": + mem_usage = self.memory_usage(index=True, deep=deep).sum() + else: + mem_usage = self.memory_usage(index=True, deep=deep) + lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") + fmt.buffer_put_lines(buf, lines) + def items(self): """ Iterate over (label, values) on info axis diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 64a8c4569406e..ee0676158c867 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -487,3 +487,23 @@ def test_categorical_series_repr_timedelta_ordered(self): 8 days 01:00:00 < 9 days 01:00:00]""" # noqa assert repr(s) == exp + + def test_info(self, capsys): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(range(len(index)), index=index, name="sth") + s.info() + expected = """ +MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') +Series name: sth + # Non-Null Count Dtype +--- -------------- ----- + 0 10 non-null int64 +dtypes: int64(1) +memory usage: 505.0+ bytes +""" + result = capsys.readouterr().out + assert result == expected From a4ad077153c790f46a2e0d71203b550c653c4a7d Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 7 Feb 2020 18:38:54 +0000 Subject: [PATCH 02/59] add whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 13827e8fc4c33..3347abcf299cb 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -44,6 +44,7 @@ Other enhancements - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) - +- :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` - .. --------------------------------------------------------------------------- From c7bfb94cb403444d49c8818b301219b11520badf Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 7 Feb 2020 18:54:16 +0000 Subject: [PATCH 03/59] docstring sharing --- pandas/core/frame.py | 5 +++++ pandas/core/generic.py | 32 ++++++++++++++++++-------------- pandas/core/series.py | 5 +++++ 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9fe1ec7b792c8..d93cef06996b3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2223,6 +2223,11 @@ def to_html( encoding=encoding, ) + def info( + self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None + ): + return super().info(verbose, buf, max_cols, memory_usage, null_counts) + # ---------------------------------------------------------------------- @Appender(info.__doc__) def info( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a266f0102f539..e1a93850868fb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1721,14 +1721,23 @@ def keys(self): """ return self._info_axis + # @Substitution(klass=self.__name__, type_sub=" and column dtypes"*(self._typ + # =='dataframe'), + # max_cols_sub=""" + # max_cols : int, optional + # When to switch from the verbose to the truncated output. If the + # DataFrame has more than `max_cols` columns, the truncated output + # is used. By default, the setting in + # ``pandas.options.display.max_info_columns`` is used. + # """*(self._typ=='dataframe'), ) def info( self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None ) -> None: """ - Print a concise summary of a DataFrame. + Print a concise summary of a %(klass)s. - This method prints information about a DataFrame including - the index dtype and column dtypes, non-null values and memory usage. + This method prints information about a %(klass)s including + the index dtype%(type_sub), non-null values and memory usage. Parameters ---------- @@ -1738,14 +1747,9 @@ def info( buf : writable buffer, defaults to sys.stdout Where to send the output. By default, the output is printed to sys.stdout. Pass a writable buffer if you need to further process - the output. - max_cols : int, optional - When to switch from the verbose to the truncated output. If the - DataFrame has more than `max_cols` columns, the truncated output - is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used. + the output.%(max_cols_sub)s memory_usage : bool, str, optional - Specifies whether total memory usage of the DataFrame + Specifies whether total memory usage of the %(klass)s elements (including the index) should be displayed. By default, this follows the ``pandas.options.display.memory_usage`` setting. @@ -1759,7 +1763,7 @@ def info( at the cost of computational resources. null_counts : bool, optional Whether to show the non-null counts. By default, this is shown - only if the frame is smaller than + only if the %(klass)s is smaller than ``pandas.options.display.max_info_rows`` and ``pandas.options.display.max_info_columns``. A value of True always shows the counts, and False never shows the counts. @@ -1767,13 +1771,13 @@ def info( Returns ------- None - This method prints a summary of a DataFrame and returns None. + This method prints a summary of a %(klass)s and returns None. See Also -------- - DataFrame.describe: Generate descriptive statistics of DataFrame + DataFrame.describe: Generate descriptive statistics of %(klass)s columns. - DataFrame.memory_usage: Memory usage of DataFrame columns. + %(klass)s.memory_usage: Memory usage of %(klass)s%(memory_sub)s. Examples -------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 9c0ff9780da3e..2e19e3b8d7808 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4137,6 +4137,11 @@ def replace( method=method, ) + def info( + self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None + ): + return super().info(verbose, buf, None, memory_usage, null_counts) + @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": return super().shift( From 01fd8028f5944c21e625fa7b70172d58f700fd7f Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 7 Feb 2020 19:15:09 +0000 Subject: [PATCH 04/59] wip --- pandas/core/frame.py | 12 ++++++++++++ pandas/core/generic.py | 18 +++++------------- pandas/core/series.py | 2 ++ 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d93cef06996b3..ba20edc2994cb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2223,6 +2223,18 @@ def to_html( encoding=encoding, ) + @Substitution( + klass="DataFrame", + type_sub=" and columns", + max_cols_sub=""" +max_cols : int, optional + When to switch from the verbose to the truncated output. If the + DataFrame has more than `max_cols` columns, the truncated output + is used. By default, the setting in + ``pandas.options.display.max_info_columns`` is used. + """, + ) + @Appender(NDFrame.info.__doc__) def info( self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None ): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e1a93850868fb..8d56fc111c4ad 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1721,15 +1721,6 @@ def keys(self): """ return self._info_axis - # @Substitution(klass=self.__name__, type_sub=" and column dtypes"*(self._typ - # =='dataframe'), - # max_cols_sub=""" - # max_cols : int, optional - # When to switch from the verbose to the truncated output. If the - # DataFrame has more than `max_cols` columns, the truncated output - # is used. By default, the setting in - # ``pandas.options.display.max_info_columns`` is used. - # """*(self._typ=='dataframe'), ) def info( self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None ) -> None: @@ -1737,7 +1728,7 @@ def info( Print a concise summary of a %(klass)s. This method prints information about a %(klass)s including - the index dtype%(type_sub), non-null values and memory usage. + the index dtype%(type_sub)s, non-null values and memory usage. Parameters ---------- @@ -1747,7 +1738,8 @@ def info( buf : writable buffer, defaults to sys.stdout Where to send the output. By default, the output is printed to sys.stdout. Pass a writable buffer if you need to further process - the output.%(max_cols_sub)s + the output. + %(max_cols_sub)s memory_usage : bool, str, optional Specifies whether total memory usage of the %(klass)s elements (including the index) should be displayed. By default, @@ -1775,9 +1767,9 @@ def info( See Also -------- - DataFrame.describe: Generate descriptive statistics of %(klass)s + DataFrame.describe: Generate descriptive statistics of DataFrame columns. - %(klass)s.memory_usage: Memory usage of %(klass)s%(memory_sub)s. + DataFrame.memory_usage: Memory usage of DataFrame columns. Examples -------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 2e19e3b8d7808..db42e856981c3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4137,6 +4137,8 @@ def replace( method=method, ) + @Substitution(klass="Series", type_sub="", max_cols_sub="") + @Appender(NDFrame.info.__doc__) def info( self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None ): From abbae9a4c4e36f6b8b0387546919fbc2a50a205f Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 19 Feb 2020 11:03:10 +0000 Subject: [PATCH 05/59] add series tests --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/frame.py | 98 ++++++++- pandas/core/generic.py | 294 +-------------------------- pandas/core/series.py | 82 +++++++- pandas/io/formats/info.py | 173 +++++----------- pandas/tests/io/formats/test_info.py | 89 +++++++- 6 files changed, 313 insertions(+), 425 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 3347abcf299cb..f75e4c71fa253 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -44,7 +44,7 @@ Other enhancements - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) - -- :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` +- :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ba20edc2994cb..cb533e3d946ef 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2223,6 +2223,7 @@ def to_html( encoding=encoding, ) + # ---------------------------------------------------------------------- @Substitution( klass="DataFrame", type_sub=" and columns", @@ -2233,17 +2234,98 @@ def to_html( is used. By default, the setting in ``pandas.options.display.max_info_columns`` is used. """, + examples_sub=""" +>>> int_values = [1, 2, 3, 4, 5] +>>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] +>>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] +>>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, +... "float_col": float_values}) +>>> df + int_col text_col float_col +0 1 alpha 0.00 +1 2 beta 0.25 +2 3 gamma 0.50 +3 4 delta 0.75 +4 5 epsilon 1.00 + +Prints information of all columns: + +>>> df.info(verbose=True) + +RangeIndex: 5 entries, 0 to 4 +Data columns (total 3 columns): + # Column Non-Null Count Dtype +--- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 +dtypes: float64(1), int64(1), object(1) +memory usage: 248.0+ bytes + +Prints a summary of columns count and its dtypes but not per column +information: + +>>> df.info(verbose=False) + +RangeIndex: 5 entries, 0 to 4 +Columns: 3 entries, int_col to float_col +dtypes: float64(1), int64(1), object(1) +memory usage: 248.0+ bytes + +Pipe output of DataFrame.info to buffer instead of sys.stdout, get +buffer content and writes to a text file: + +>>> import io +>>> buffer = io.StringIO() +>>> df.info(buf=buffer) +>>> s = buffer.getvalue() +>>> with open("df_info.txt", "w", +... encoding="utf-8") as f: # doctest: +SKIP +... f.write(s) +260 +The `memory_usage` parameter allows deep introspection mode, specially +useful for big DataFrames and fine-tune memory optimization: +>>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) +>>> df = pd.DataFrame({ +... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), +... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), +... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) +... }) +>>> df.info() + +RangeIndex: 1000000 entries, 0 to 999999 +Data columns (total 3 columns): + # Column Non-Null Count Dtype +--- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object +dtypes: object(3) +memory usage: 22.9+ MB +>>> df.info(memory_usage='deep') + +RangeIndex: 1000000 entries, 0 to 999999 +Data columns (total 3 columns): + # Column Non-Null Count Dtype +--- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object +dtypes: object(3) +memory usage: 188.8 MB""", + see_also_sub=""" +DataFrame.describe: Generate descriptive statistics of DataFrame + columns. +DataFrame.memory_usage: Memory usage of DataFrame columns.""", ) - @Appender(NDFrame.info.__doc__) - def info( - self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None - ): - return super().info(verbose, buf, max_cols, memory_usage, null_counts) - - # ---------------------------------------------------------------------- @Appender(info.__doc__) def info( - self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None + self, + verbose: Optional[bool] = None, + buf: Optional[IO[str]] = None, + max_cols: Optional[int] = None, + memory_usage: Optional[Union[bool, str]] = None, + null_counts: Optional[bool] = None, ) -> None: return info(self, verbose, buf, max_cols, memory_usage, null_counts) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8d56fc111c4ad..579daae2b15c6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6,7 +6,6 @@ import operator import pickle import re -import sys from textwrap import dedent from typing import ( TYPE_CHECKING, @@ -29,7 +28,7 @@ import numpy as np -from pandas._config import config, get_option +from pandas._config import config from pandas._libs import Timestamp, iNaT, lib from pandas._typing import ( @@ -126,10 +125,6 @@ ) -def _put_str(s, space): - return str(s)[:space].ljust(space) - - def _single_replace(self, to_replace, method, inplace, limit): """ Replaces values in a Series using the fill method specified when no @@ -1721,293 +1716,6 @@ def keys(self): """ return self._info_axis - def info( - self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None - ) -> None: - """ - Print a concise summary of a %(klass)s. - - This method prints information about a %(klass)s including - the index dtype%(type_sub)s, non-null values and memory usage. - - Parameters - ---------- - verbose : bool, optional - Whether to print the full summary. By default, the setting in - ``pandas.options.display.max_info_columns`` is followed. - buf : writable buffer, defaults to sys.stdout - Where to send the output. By default, the output is printed to - sys.stdout. Pass a writable buffer if you need to further process - the output. - %(max_cols_sub)s - memory_usage : bool, str, optional - Specifies whether total memory usage of the %(klass)s - elements (including the index) should be displayed. By default, - this follows the ``pandas.options.display.memory_usage`` setting. - - True always show memory usage. False never shows memory usage. - A value of 'deep' is equivalent to "True with deep introspection". - Memory usage is shown in human-readable units (base-2 - representation). Without deep introspection a memory estimation is - made based in column dtype and number of rows assuming values - consume the same memory amount for corresponding dtypes. With deep - memory introspection, a real memory usage calculation is performed - at the cost of computational resources. - null_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the %(klass)s is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. - - Returns - ------- - None - This method prints a summary of a %(klass)s and returns None. - - See Also - -------- - DataFrame.describe: Generate descriptive statistics of DataFrame - columns. - DataFrame.memory_usage: Memory usage of DataFrame columns. - - Examples - -------- - >>> int_values = [1, 2, 3, 4, 5] - >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] - >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] - >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, - ... "float_col": float_values}) - >>> df - int_col text_col float_col - 0 1 alpha 0.00 - 1 2 beta 0.25 - 2 3 gamma 0.50 - 3 4 delta 0.75 - 4 5 epsilon 1.00 - - Prints information of all columns: - - >>> df.info(verbose=True) - - RangeIndex: 5 entries, 0 to 4 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 int_col 5 non-null int64 - 1 text_col 5 non-null object - 2 float_col 5 non-null float64 - dtypes: float64(1), int64(1), object(1) - memory usage: 248.0+ bytes - - Prints a summary of columns count and its dtypes but not per column - information: - - >>> df.info(verbose=False) - - RangeIndex: 5 entries, 0 to 4 - Columns: 3 entries, int_col to float_col - dtypes: float64(1), int64(1), object(1) - memory usage: 248.0+ bytes - - Pipe output of DataFrame.info to buffer instead of sys.stdout, get - buffer content and writes to a text file: - - >>> import io - >>> buffer = io.StringIO() - >>> df.info(buf=buffer) - >>> s = buffer.getvalue() - >>> with open("df_info.txt", "w", - ... encoding="utf-8") as f: # doctest: +SKIP - ... f.write(s) - 260 - - The `memory_usage` parameter allows deep introspection mode, specially - useful for big DataFrames and fine-tune memory optimization: - - >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) - >>> df = pd.DataFrame({ - ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) - ... }) - >>> df.info() - - RangeIndex: 1000000 entries, 0 to 999999 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object - dtypes: object(3) - memory usage: 22.9+ MB - - >>> df.info(memory_usage='deep') - - RangeIndex: 1000000 entries, 0 to 999999 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object - dtypes: object(3) - memory usage: 188.8 MB - """ - if buf is None: # pragma: no cover - buf = sys.stdout - - lines = [] - - lines.append(str(type(self))) - lines.append(self.index._summary()) - - if self._typ == "dataframe": - cols = self.columns - dtypes = self.dtypes - else: - cols = pd.Series([self.name]) - dtypes = pd.Series([self.dtypes]) - - col_count = len(cols) - - if col_count == 0: - lines.append(f"Empty {type(self).__name__}") - fmt.buffer_put_lines(buf, lines) - return - - # hack - if max_cols is None: - max_cols = get_option("display.max_info_columns", col_count + 1) - - max_rows = get_option("display.max_info_rows", len(self) + 1) - - if null_counts is None: - show_counts = (col_count <= max_cols) and (len(self) < max_rows) - else: - show_counts = null_counts - exceeds_info_cols = col_count > max_cols - - def _verbose_repr(): - if self._typ == "dataframe": - lines.append(f"Data columns (total {col_count} columns):") - counts = self.count() - else: - lines.append(f"Series name: {self.name}") - counts = pd.Series([self.count()]) - - id_head = " # " - column_head = "Column" - col_space = 2 - - max_col = max(len(pprint_thing(k)) for k in cols) - len_column = len(pprint_thing(column_head)) - space = max(max_col, len_column) + col_space - - max_id = len(pprint_thing(col_count)) - len_id = len(pprint_thing(id_head)) - space_num = max(max_id, len_id) + col_space - - header = _put_str(id_head, space_num) - if self._typ == "dataframe": - header += _put_str(column_head, space) - if show_counts: - if len(cols) != len(counts): # pragma: no cover - raise AssertionError( - f"Columns must equal counts ({col_count} != {len(counts)})" - ) - count_header = "Non-Null Count" - len_count = len(count_header) - non_null = " non-null" - max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) - space_count = max(len_count, max_count) + col_space - count_temp = "{count}" + non_null - else: - counts = None # can this be remove? - count_header = "" - space_count = len(count_header) - len_count = space_count - count_temp = "{count}" - - dtype_header = "Dtype" - len_dtype = len(dtype_header) - max_dtypes = max(len(pprint_thing(k)) for k in dtypes) - space_dtype = max(len_dtype, max_dtypes) - header += _put_str(count_header, space_count) + _put_str( - dtype_header, space_dtype - ) - - lines.append(header) - lines.append( - _put_str("-" * len_id, space_num) - + _put_str("-" * len_column, space) * (self._typ == "dataframe") - + _put_str("-" * len_count, space_count) - + _put_str("-" * len_dtype, space_dtype) - ) - - for i, col in enumerate(cols): - dtype = dtypes.iloc[i] - col = pprint_thing(col) - - line_no = _put_str(f" {i}", space_num) - count = "" - if show_counts: - count = counts.iloc[i] - - lines.append( - line_no - + _put_str(col, space) * (self._typ == "dataframe") - + _put_str(count_temp.format(count=count), space_count) - + _put_str(dtype, space_dtype) - ) - - def _non_verbose_repr(): - lines.append(cols._summary(name="Columns")) - - def _sizeof_fmt(num, size_qualifier): - # returns size in human readable format - for x in ["bytes", "KB", "MB", "GB", "TB"]: - if num < 1024.0: - return f"{num:3.1f}{size_qualifier} {x}" - num /= 1024.0 - return f"{num:3.1f}{size_qualifier} PB" - - if verbose: - _verbose_repr() - elif verbose is False: # specifically set to False, not nesc None - _non_verbose_repr() - else: - if exceeds_info_cols: - _non_verbose_repr() - else: - _verbose_repr() - - counts = self._data.get_dtype_counts() - dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] - lines.append(f"dtypes: {', '.join(dtypes)}") - - if memory_usage is None: - memory_usage = get_option("display.memory_usage") - if memory_usage: - # append memory usage of df to display - size_qualifier = "" - if memory_usage == "deep": - deep = True - else: - # size_qualifier is just a best effort; not guaranteed to catch - # all cases (e.g., it misses categorical data even with object - # categories) - deep = False - if "object" in counts or self.index._is_memory_usage_qualified(): - size_qualifier = "+" - if self._typ == "dataframe": - mem_usage = self.memory_usage(index=True, deep=deep).sum() - else: - mem_usage = self.memory_usage(index=True, deep=deep) - lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") - fmt.buffer_put_lines(buf, lines) - def items(self): """ Iterate over (label, values) on info axis diff --git a/pandas/core/series.py b/pandas/core/series.py index db42e856981c3..6c4c7ab2b9692 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -15,6 +15,7 @@ Optional, Tuple, Type, + Union, ) import warnings @@ -92,6 +93,7 @@ from pandas.core.tools.datetimes import to_datetime import pandas.io.formats.format as fmt +from pandas.io.formats.info import info import pandas.plotting if TYPE_CHECKING: @@ -4137,12 +4139,82 @@ def replace( method=method, ) - @Substitution(klass="Series", type_sub="", max_cols_sub="") - @Appender(NDFrame.info.__doc__) + @Substitution( + klass="Series", + type_sub="", + max_cols_sub="", + examples_sub=""" +>>> int_values = [1, 2, 3, 4, 5] +>>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] +>>> s = pd.Series(text_values, index=int_values) +>>> s.info() + +Int64Index: 5 entries, 1 to 5 +Series name: None + # Non-Null Count Dtype +--- -------------- ----- + 0 5 non-null object +dtypes: object(1) +memory usage: 80.0+ bytes + +Prints a summary excluding information about its values: + +>>> s.info(verbose=False) + +Int64Index: 5 entries, 1 to 5 +dtypes: object(1) +memory usage: 80.0+ bytes + +Pipe output of Series.info to buffer instead of sys.stdout, get +buffer content and writes to a text file: + +>>> import io +>>> buffer = io.StringIO() +>>> s.info(buf=buffer) +>>> s = buffer.getvalue() +>>> with open("df_info.txt", "w", +... encoding="utf-8") as f: # doctest: +SKIP +... f.write(s) +260 + +The `memory_usage` parameter allows deep introspection mode, specially +useful for big Series and fine-tune memory optimization: + +>>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) +>>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6)) +>>> s.info() + +RangeIndex: 1000000 entries, 0 to 999999 +Series name: None + # Non-Null Count Dtype +--- -------------- ----- + 0 1000000 non-null object +dtypes: object(1) +memory usage: 7.6+ MB + +>>> s.info(memory_usage='deep') + +RangeIndex: 1000000 entries, 0 to 999999 +Series name: None + # Non-Null Count Dtype +--- -------------- ----- + 0 1000000 non-null object +dtypes: object(1) +memory usage: 62.9 MB""", + see_also_sub=""" +Series.describe: Generate descriptive statistics of Series. +Series.memory_usage: Memory usage of Series.""", + ) + @Appender(info.__doc__) def info( - self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None - ): - return super().info(verbose, buf, None, memory_usage, null_counts) + self, + verbose: Optional[bool] = None, + buf: Optional[IO[str]] = None, + max_cols: Optional[int] = None, + memory_usage: Optional[Union[bool, str]] = None, + null_counts: Optional[bool] = None, + ) -> None: + return info(self, verbose, buf, None, memory_usage, null_counts) @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 0c08065f55273..623215ce3e873 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,28 +1,37 @@ import sys +from typing import IO, Optional, Union from pandas._config import get_option +from pandas._typing import FrameOrSeries + +from pandas.core.indexes.api import Index + from pandas.io.formats import format as fmt from pandas.io.formats.printing import pprint_thing -def _put_str(s, space): +def _put_str(s, space: int) -> str: + # todo type s return str(s)[:space].ljust(space) def info( - data, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None + data: FrameOrSeries, + verbose: Optional[bool] = None, + buf: Optional[IO[str]] = None, + max_cols: Optional[int] = None, + memory_usage: Optional[Union[bool, str]] = None, + null_counts: Optional[bool] = None, ) -> None: """ - Print a concise summary of a DataFrame. + Print a concise summary of a %(klass)s. - This method prints information about a DataFrame including - the index dtype and column dtypes, non-null values and memory usage. + This method prints information about a %(klass)s including + the index dtype%(type_sub)s, non-null values and memory usage. Parameters ---------- - data : DataFrame - DataFrame to print information about. verbose : bool, optional Whether to print the full summary. By default, the setting in ``pandas.options.display.max_info_columns`` is followed. @@ -30,16 +39,11 @@ def info( Where to send the output. By default, the output is printed to sys.stdout. Pass a writable buffer if you need to further process the output. - max_cols : int, optional - When to switch from the verbose to the truncated output. If the - DataFrame has more than `max_cols` columns, the truncated output - is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used. + %(max_cols_sub)s memory_usage : bool, str, optional - Specifies whether total memory usage of the DataFrame + Specifies whether total memory usage of the %(klass)s elements (including the index) should be displayed. By default, this follows the ``pandas.options.display.memory_usage`` setting. - True always show memory usage. False never shows memory usage. A value of 'deep' is equivalent to "True with deep introspection". Memory usage is shown in human-readable units (base-2 @@ -50,7 +54,7 @@ def info( at the cost of computational resources. null_counts : bool, optional Whether to show the non-null counts. By default, this is shown - only if the frame is smaller than + only if the %(klass)s is smaller than ``pandas.options.display.max_info_rows`` and ``pandas.options.display.max_info_columns``. A value of True always shows the counts, and False never shows the counts. @@ -58,97 +62,15 @@ def info( Returns ------- None - This method prints a summary of a DataFrame and returns None. + This method prints a summary of a %(klass)s and returns None. See Also -------- - DataFrame.describe: Generate descriptive statistics of DataFrame - columns. - DataFrame.memory_usage: Memory usage of DataFrame columns. + %(see_also_sub)s Examples -------- - >>> int_values = [1, 2, 3, 4, 5] - >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] - >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] - >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, - ... "float_col": float_values}) - >>> df - int_col text_col float_col - 0 1 alpha 0.00 - 1 2 beta 0.25 - 2 3 gamma 0.50 - 3 4 delta 0.75 - 4 5 epsilon 1.00 - - Prints information of all columns: - - >>> df.info(verbose=True) - - RangeIndex: 5 entries, 0 to 4 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 int_col 5 non-null int64 - 1 text_col 5 non-null object - 2 float_col 5 non-null float64 - dtypes: float64(1), int64(1), object(1) - memory usage: 248.0+ bytes - - Prints a summary of columns count and its dtypes but not per column - information: - - >>> df.info(verbose=False) - - RangeIndex: 5 entries, 0 to 4 - Columns: 3 entries, int_col to float_col - dtypes: float64(1), int64(1), object(1) - memory usage: 248.0+ bytes - - Pipe output of DataFrame.info to buffer instead of sys.stdout, get - buffer content and writes to a text file: - - >>> import io - >>> buffer = io.StringIO() - >>> df.info(buf=buffer) - >>> s = buffer.getvalue() - >>> with open("df_info.txt", "w", - ... encoding="utf-8") as f: # doctest: +SKIP - ... f.write(s) - 260 - - The `memory_usage` parameter allows deep introspection mode, specially - useful for big DataFrames and fine-tune memory optimization: - - >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) - >>> df = pd.DataFrame({ - ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) - ... }) - >>> df.info() - - RangeIndex: 1000000 entries, 0 to 999999 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object - dtypes: object(3) - memory usage: 22.9+ MB - - >>> df.info(memory_usage='deep') - - RangeIndex: 1000000 entries, 0 to 999999 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object - dtypes: object(3) - memory usage: 188.8 MB + %(examples_sub)s """ if buf is None: # pragma: no cover buf = sys.stdout @@ -158,17 +80,23 @@ def info( lines.append(str(type(data))) lines.append(data.index._summary()) - if len(data.columns) == 0: + if data._typ == "dataframe": + cols = data.columns + dtypes = data.dtypes + else: + cols = Index([data.name]) + dtypes = Index([data.dtypes]) + + col_count = len(cols) + + if col_count == 0: lines.append(f"Empty {type(data).__name__}") fmt.buffer_put_lines(buf, lines) return - cols = data.columns - col_count = len(data.columns) - # hack if max_cols is None: - max_cols = get_option("display.max_info_columns", len(data.columns) + 1) + max_cols = get_option("display.max_info_columns", col_count + 1) max_rows = get_option("display.max_info_rows", len(data) + 1) @@ -179,7 +107,12 @@ def info( exceeds_info_cols = col_count > max_cols def _verbose_repr(): - lines.append(f"Data columns (total {len(data.columns)} columns):") + if data._typ == "dataframe": + lines.append(f"Data columns (total {col_count} columns):") + counts = data.count() + else: + lines.append(f"Series name: {data.name}") + counts = Index([data.count()]) id_head = " # " column_head = "Column" @@ -193,12 +126,13 @@ def _verbose_repr(): len_id = len(pprint_thing(id_head)) space_num = max(max_id, len_id) + col_space - header = _put_str(id_head, space_num) + _put_str(column_head, space) + header = _put_str(id_head, space_num) + if data._typ == "dataframe": + header += _put_str(column_head, space) if show_counts: - counts = data.count() if len(cols) != len(counts): # pragma: no cover raise AssertionError( - f"Columns must equal counts ({len(cols)} != {len(counts)})" + f"Columns must equal counts ({col_count} != {len(counts)})" ) count_header = "Non-Null Count" len_count = len(count_header) @@ -207,6 +141,7 @@ def _verbose_repr(): space_count = max(len_count, max_count) + col_space count_temp = "{count}" + non_null else: + counts = None # can this be remove? count_header = "" space_count = len(count_header) len_count = space_count @@ -214,7 +149,7 @@ def _verbose_repr(): dtype_header = "Dtype" len_dtype = len(dtype_header) - max_dtypes = max(len(pprint_thing(k)) for k in data.dtypes) + max_dtypes = max(len(pprint_thing(k)) for k in dtypes) space_dtype = max(len_dtype, max_dtypes) header += _put_str(count_header, space_count) + _put_str( dtype_header, space_dtype @@ -223,29 +158,30 @@ def _verbose_repr(): lines.append(header) lines.append( _put_str("-" * len_id, space_num) - + _put_str("-" * len_column, space) + + _put_str("-" * len_column, space) * (data._typ == "dataframe") + _put_str("-" * len_count, space_count) + _put_str("-" * len_dtype, space_dtype) ) - for i, col in enumerate(data.columns): - dtype = data.dtypes.iloc[i] + for i, col in enumerate(cols): + dtype = dtypes[i] col = pprint_thing(col) line_no = _put_str(f" {i}", space_num) count = "" if show_counts: - count = counts.iloc[i] + count = counts[i] lines.append( line_no - + _put_str(col, space) + + _put_str(col, space) * (data._typ == "dataframe") + _put_str(count_temp.format(count=count), space_count) + _put_str(dtype, space_dtype) ) def _non_verbose_repr(): - lines.append(data.columns._summary(name="Columns")) + if data._typ == "dataframe": + lines.append(cols._summary(name="Columns")) def _sizeof_fmt(num, size_qualifier): # returns size in human readable format @@ -283,6 +219,9 @@ def _sizeof_fmt(num, size_qualifier): deep = False if "object" in counts or data.index._is_memory_usage_qualified(): size_qualifier = "+" - mem_usage = data.memory_usage(index=True, deep=deep).sum() + if data._typ == "dataframe": + mem_usage = data.memory_usage(index=True, deep=deep).sum() + else: + mem_usage = data.memory_usage(index=True, deep=deep) lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") fmt.buffer_put_lines(buf, lines) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 877bd1650ae60..9a41810a0af33 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -67,8 +67,19 @@ def test_info_categorical_column(): buf = StringIO() df2.info(buf=buf) + s = Series( + np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) + ).astype("category") + s.isna() + buf = StringIO() + s.info(buf=buf) + + s2 = s[s == "d"] + buf = StringIO() + s2.info(buf=buf) -def test_info(float_frame, datetime_frame): + +def test_info_frame(float_frame, datetime_frame): io = StringIO() float_frame.info(buf=io) datetime_frame.info(buf=io) @@ -79,6 +90,32 @@ def test_info(float_frame, datetime_frame): frame.info(verbose=False) +@pytest.mark.parametrize("verbose", [True, False]) +def test_info_series(self, verbose): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(range(len(index)), index=index, name="sth") + buf = StringIO() + s.info(verbose=verbose, buf=buf) + expected = """ +MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') +""" + if verbose: + expected += """Series name: sth + # Non-Null Count Dtype +--- -------------- ----- + 0 10 non-null int64 +""" + expected += f"""dtypes: int64(1) +memory usage: {s.memory_usage()}.0+ bytes +""" + result = buf.getvalue() + assert result == expected + + def test_info_verbose(): buf = StringIO() size = 1001 @@ -320,6 +357,14 @@ def test_info_memory_usage_deep_not_pypy(): df_object = DataFrame({"a": ["a"]}) assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) > s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) > s_object.memory_usage() + @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") def test_info_memory_usage_deep_pypy(): @@ -332,6 +377,14 @@ def test_info_memory_usage_deep_pypy(): df_object = DataFrame({"a": ["a"]}) assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) == s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) == s_object.memory_usage() + @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") def test_usage_via_getsizeof(): @@ -373,6 +426,26 @@ def test_info_memory_usage_qualified(): df.info(buf=buf) assert "+" in buf.getvalue() + buf = StringIO() + s = Series(1, index=[1, 2, 3]) + s.info(buf=buf) + assert "+" not in buf.getvalue() + + buf = StringIO() + s = Series(1, index=list("ABC")) + s.info(buf=buf) + assert "+" in buf.getvalue() + + buf = StringIO() + s = Series(1, index=MultiIndex.from_product([range(3), range(3)]),) + s.info(buf=buf) + assert "+" not in buf.getvalue() + + buf = StringIO() + s = Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]]),) + s.info(buf=buf) + assert "+" in buf.getvalue() + def test_info_memory_usage_bug_on_multiindex(): # GH 14308 @@ -395,6 +468,15 @@ def memory_usage(f): # high upper bound assert memory_usage(unstacked) - memory_usage(df) < 2000 + s = Series(np.random.randn(N * M), index=index) + + unstacked = s.unstack("id") + assert s.values.nbytes == unstacked.values.nbytes + assert s.memory_usage(deep=True) > unstacked.memory_usage(deep=True).sum() + + # high upper bound + assert unstacked.memory_usage(deep=True).sum() - s.memory_usage(deep=True) < 2000 + def test_info_categorical(): # GH14298 @@ -403,3 +485,8 @@ def test_info_categorical(): buf = StringIO() df.info(buf=buf) + + s = Series(np.zeros((2)), index=idx) + + buf = StringIO() + s.info(buf=buf) From 1a474fe7a3a351f5ecc6b29028713503d63ba7fa Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 19 Feb 2020 11:31:41 +0000 Subject: [PATCH 06/59] formatting --- doc/source/whatsnew/v1.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f75e4c71fa253..e2444f8a12d2f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -43,7 +43,6 @@ Other enhancements - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) -- - :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`) - From b30ce1bbbc3bf1d1aadb7f89038d513bc2e1c830 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 19 Feb 2020 11:32:47 +0000 Subject: [PATCH 07/59] formatting --- pandas/core/frame.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cb533e3d946ef..61ceae1b2d313 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2283,8 +2283,10 @@ def to_html( ... encoding="utf-8") as f: # doctest: +SKIP ... f.write(s) 260 + The `memory_usage` parameter allows deep introspection mode, specially useful for big DataFrames and fine-tune memory optimization: + >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) >>> df = pd.DataFrame({ ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), @@ -2302,6 +2304,7 @@ def to_html( 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 22.9+ MB + >>> df.info(memory_usage='deep') RangeIndex: 1000000 entries, 0 to 999999 From 6d8c7658fa38c47f8e86ae352563f5478391be28 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 19 Feb 2020 11:35:00 +0000 Subject: [PATCH 08/59] remove old file --- pandas/tests/series/test_repr.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index ee0676158c867..64a8c4569406e 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -487,23 +487,3 @@ def test_categorical_series_repr_timedelta_ordered(self): 8 days 01:00:00 < 9 days 01:00:00]""" # noqa assert repr(s) == exp - - def test_info(self, capsys): - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["first", "second"], - ) - s = Series(range(len(index)), index=index, name="sth") - s.info() - expected = """ -MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') -Series name: sth - # Non-Null Count Dtype ---- -------------- ----- - 0 10 non-null int64 -dtypes: int64(1) -memory usage: 505.0+ bytes -""" - result = capsys.readouterr().out - assert result == expected From 99411e4df06add539c9b764d22d4940d949acffb Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 19 Feb 2020 11:42:02 +0000 Subject: [PATCH 09/59] clean --- pandas/io/formats/info.py | 6 ++---- pandas/tests/io/formats/test_info.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 623215ce3e873..0682628bee95b 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -3,7 +3,7 @@ from pandas._config import get_option -from pandas._typing import FrameOrSeries +from pandas._typing import FrameOrSeries, Dtype from pandas.core.indexes.api import Index @@ -11,8 +11,7 @@ from pandas.io.formats.printing import pprint_thing -def _put_str(s, space: int) -> str: - # todo type s +def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) @@ -141,7 +140,6 @@ def _verbose_repr(): space_count = max(len_count, max_count) + col_space count_temp = "{count}" + non_null else: - counts = None # can this be remove? count_header = "" space_count = len(count_header) len_count = space_count diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 9a41810a0af33..136fa1c09949a 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -91,7 +91,7 @@ def test_info_frame(float_frame, datetime_frame): @pytest.mark.parametrize("verbose", [True, False]) -def test_info_series(self, verbose): +def test_info_series(verbose): index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], From 4651bd76e77631dedcd0e71d5e7caa38c48e29b0 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 19 Feb 2020 11:55:29 +0000 Subject: [PATCH 10/59] add test --- pandas/tests/io/formats/test_info.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 136fa1c09949a..7d22ac771ab90 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -159,6 +159,25 @@ def test_info_memory(): ) assert result == expected + s = Series([1, 2], dtype="i8") + buf = StringIO() + s.info(buf=buf) + result = buf.getvalue() + bytes = float(s.memory_usage()) + expected = textwrap.dedent( + f"""\ + + RangeIndex: 2 entries, 0 to 1 + Series name: None + # Non-Null Count Dtype + --- -------------- ----- + 0 2 non-null int64 + dtypes: int64(1) + memory usage: {bytes} bytes + """ + ) + assert result == expected + def test_info_wide(): io = StringIO() From 7de47036157fb25217f93ab093301aa77cc21ba4 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 19 Feb 2020 12:09:38 +0000 Subject: [PATCH 11/59] add test --- pandas/tests/io/formats/test_info.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 7d22ac771ab90..415a7757fdae3 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -244,6 +244,14 @@ def test_info_shows_column_dtypes(): name = f" {i:d} {i:d} {n:d} non-null {dtype}" assert name in res + for dtype in dtypes: + s = Series(np.random.randint(2, size=n).astype(dtype)) + buf = StringIO() + s.info(buf=buf) + res = buf.getvalue() + name = f" 0 {n:d} non-null {dtype}" + assert name in res + def test_info_max_cols(): df = DataFrame(np.random.randn(10, 5)) From 99472fd5b8d73bcdd9b9ac36815b3206f4adad34 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 19 Feb 2020 12:27:50 +0000 Subject: [PATCH 12/59] isort --- pandas/io/formats/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 0682628bee95b..5e9ceea5429fc 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -3,7 +3,7 @@ from pandas._config import get_option -from pandas._typing import FrameOrSeries, Dtype +from pandas._typing import Dtype, FrameOrSeries from pandas.core.indexes.api import Index From 2902fe7f7bdfb03fbe6aa5537f19d7f1cf26b7cd Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 19 Feb 2020 13:09:44 +0000 Subject: [PATCH 13/59] remove test --- pandas/tests/series/test_api.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 33706c00c53f4..a4de7b3ece2f7 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -324,12 +324,6 @@ def test_items_strings(self, string_series): # assert is lazy (generators don't define reverse, lists do) assert not hasattr(string_series.items(), "reverse") - def test_raise_on_info(self): - s = Series(np.random.randn(10)) - msg = "'Series' object has no attribute 'info'" - with pytest.raises(AttributeError, match=msg): - s.info() - def test_copy(self): for deep in [None, False, True]: From c6d8a762b501dea901357f68e6402a56f5129ba7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 23 Feb 2020 12:09:53 +0000 Subject: [PATCH 14/59] use isinstance abcdataframe, disallow max_cols for series.info --- pandas/core/series.py | 5 +++++ pandas/io/formats/info.py | 14 +++++++------- pandas/tests/io/formats/test_info.py | 5 +++++ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 7cb38f5869b95..f4fe259b85c49 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4216,6 +4216,11 @@ def info( memory_usage: Optional[Union[bool, str]] = None, null_counts: Optional[bool] = None, ) -> None: + if max_cols is not None: + raise ValueError( + "Argument `max_cols` can only be passed " + "in DataFrame.info, not Series.info" + ) return info(self, verbose, buf, None, memory_usage, null_counts) @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 5e9ceea5429fc..37cdbfd47cfec 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -79,7 +79,7 @@ def info( lines.append(str(type(data))) lines.append(data.index._summary()) - if data._typ == "dataframe": + if isinstance(data, ABCDataFrame): cols = data.columns dtypes = data.dtypes else: @@ -106,7 +106,7 @@ def info( exceeds_info_cols = col_count > max_cols def _verbose_repr(): - if data._typ == "dataframe": + if isinstance(data, ABCDataFrame): lines.append(f"Data columns (total {col_count} columns):") counts = data.count() else: @@ -126,7 +126,7 @@ def _verbose_repr(): space_num = max(max_id, len_id) + col_space header = _put_str(id_head, space_num) - if data._typ == "dataframe": + if isinstance(data, ABCDataFrame): header += _put_str(column_head, space) if show_counts: if len(cols) != len(counts): # pragma: no cover @@ -156,7 +156,7 @@ def _verbose_repr(): lines.append(header) lines.append( _put_str("-" * len_id, space_num) - + _put_str("-" * len_column, space) * (data._typ == "dataframe") + + _put_str("-" * len_column, space) * isinstance(data, ABCDataFrame) + _put_str("-" * len_count, space_count) + _put_str("-" * len_dtype, space_dtype) ) @@ -172,13 +172,13 @@ def _verbose_repr(): lines.append( line_no - + _put_str(col, space) * (data._typ == "dataframe") + + _put_str(col, space) * isinstance(data, ABCDataFrame) + _put_str(count_temp.format(count=count), space_count) + _put_str(dtype, space_dtype) ) def _non_verbose_repr(): - if data._typ == "dataframe": + if isinstance(data, ABCDataFrame): lines.append(cols._summary(name="Columns")) def _sizeof_fmt(num, size_qualifier): @@ -217,7 +217,7 @@ def _sizeof_fmt(num, size_qualifier): deep = False if "object" in counts or data.index._is_memory_usage_qualified(): size_qualifier = "+" - if data._typ == "dataframe": + if isinstance(data, ABCDataFrame): mem_usage = data.memory_usage(index=True, deep=deep).sum() else: mem_usage = data.memory_usage(index=True, deep=deep) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 415a7757fdae3..f5cf3b9253d0f 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -196,6 +196,11 @@ def test_info_wide(): assert rs == xp reset_option("display.max_info_columns") + s = Series(np.random.randn(101)) + msg = ("Argument `max_cols` can only be passed " + "in DataFrame.info, not Series.info") + with pytest.raises(ValueError, match=msg): + s.info(max_cols=1) def test_info_duplicate_columns(): io = StringIO() From d0b2e1f82fbeb2d139fb778b1494c4097aee8f6e Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 23 Feb 2020 13:22:19 +0000 Subject: [PATCH 15/59] refactor --- pandas/core/series.py | 2 +- pandas/io/formats/info.py | 84 ++++++++++++++++------------ pandas/tests/io/formats/test_info.py | 4 +- 3 files changed, 50 insertions(+), 40 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index f4fe259b85c49..a0aa34714ff30 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4220,7 +4220,7 @@ def info( raise ValueError( "Argument `max_cols` can only be passed " "in DataFrame.info, not Series.info" - ) + ) return info(self, verbose, buf, None, memory_usage, null_counts) @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 37cdbfd47cfec..443b08e653d6a 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -6,6 +6,7 @@ from pandas._typing import Dtype, FrameOrSeries from pandas.core.indexes.api import Index +from pandas.core.dtypes.generic import ABCDataFrame from pandas.io.formats import format as fmt from pandas.io.formats.printing import pprint_thing @@ -79,57 +80,61 @@ def info( lines.append(str(type(data))) lines.append(data.index._summary()) + max_rows = get_option("display.max_info_rows", len(data) + 1) + if isinstance(data, ABCDataFrame): - cols = data.columns + ids = data.columns dtypes = data.dtypes - else: - cols = Index([data.name]) - dtypes = Index([data.dtypes]) - - col_count = len(cols) + col_count = len(ids) - if col_count == 0: - lines.append(f"Empty {type(data).__name__}") - fmt.buffer_put_lines(buf, lines) - return + if col_count == 0: + lines.append(f"Empty {type(data).__name__}") + fmt.buffer_put_lines(buf, lines) + return - # hack - if max_cols is None: - max_cols = get_option("display.max_info_columns", col_count + 1) + # hack + if max_cols is None: + max_cols = get_option("display.max_info_columns", col_count + 1) - max_rows = get_option("display.max_info_rows", len(data) + 1) + if null_counts is None: + show_counts = (col_count <= max_cols) and (len(data) < max_rows) + else: + show_counts = null_counts + exceeds_info_cols = col_count > max_cols - if null_counts is None: - show_counts = (col_count <= max_cols) and (len(data) < max_rows) else: - show_counts = null_counts - exceeds_info_cols = col_count > max_cols + ids = Index([data.name]) + dtypes = Index([data.dtypes]) + exceeds_info_cols = False + show_counts = True def _verbose_repr(): + + id_head = " # " + id_space = 2 + len_id = len(pprint_thing(id_head)) + if isinstance(data, ABCDataFrame): + column_head = "Column" lines.append(f"Data columns (total {col_count} columns):") counts = data.count() + max_id = len(pprint_thing(col_count)) + max_col = max(len(pprint_thing(k)) for k in ids) + len_column = len(pprint_thing(column_head)) + space = max(max_col, len_column) + id_space + space_num = max(max_id, len_id) + id_space + column_string = _put_str("-" * len_column, space) else: lines.append(f"Series name: {data.name}") counts = Index([data.count()]) - - id_head = " # " - column_head = "Column" - col_space = 2 - - max_col = max(len(pprint_thing(k)) for k in cols) - len_column = len(pprint_thing(column_head)) - space = max(max_col, len_column) + col_space - - max_id = len(pprint_thing(col_count)) - len_id = len(pprint_thing(id_head)) - space_num = max(max_id, len_id) + col_space + space_num = len_id + id_space + column_string = "" header = _put_str(id_head, space_num) if isinstance(data, ABCDataFrame): header += _put_str(column_head, space) if show_counts: - if len(cols) != len(counts): # pragma: no cover + if len(ids) != len(counts): # pragma: no cover raise AssertionError( f"Columns must equal counts ({col_count} != {len(counts)})" ) @@ -137,7 +142,7 @@ def _verbose_repr(): len_count = len(count_header) non_null = " non-null" max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) - space_count = max(len_count, max_count) + col_space + space_count = max(len_count, max_count) + id_space count_temp = "{count}" + non_null else: count_header = "" @@ -156,30 +161,35 @@ def _verbose_repr(): lines.append(header) lines.append( _put_str("-" * len_id, space_num) - + _put_str("-" * len_column, space) * isinstance(data, ABCDataFrame) + + column_string + _put_str("-" * len_count, space_count) + _put_str("-" * len_dtype, space_dtype) ) - for i, col in enumerate(cols): + for i, id_ in enumerate(ids): dtype = dtypes[i] - col = pprint_thing(col) + id_ = pprint_thing(id_) line_no = _put_str(f" {i}", space_num) count = "" if show_counts: count = counts[i] + if isinstance(data, ABCDataFrame): + column_string = _put_str(id_, space) + else: + column_string = "" + lines.append( line_no - + _put_str(col, space) * isinstance(data, ABCDataFrame) + + column_string + _put_str(count_temp.format(count=count), space_count) + _put_str(dtype, space_dtype) ) def _non_verbose_repr(): if isinstance(data, ABCDataFrame): - lines.append(cols._summary(name="Columns")) + lines.append(ids._summary(name="Columns")) def _sizeof_fmt(num, size_qualifier): # returns size in human readable format diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index f5cf3b9253d0f..94c08bb0c734b 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -197,11 +197,11 @@ def test_info_wide(): reset_option("display.max_info_columns") s = Series(np.random.randn(101)) - msg = ("Argument `max_cols` can only be passed " - "in DataFrame.info, not Series.info") + msg = "Argument `max_cols` can only be passed " "in DataFrame.info, not Series.info" with pytest.raises(ValueError, match=msg): s.info(max_cols=1) + def test_info_duplicate_columns(): io = StringIO() From 222581040f39a2be23eac59d20511441a40e4fdd Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 23 Feb 2020 14:44:35 +0000 Subject: [PATCH 16/59] aint no autoformatter gonna unnecessarily split my strings --- pandas/tests/io/formats/test_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 94c08bb0c734b..049c339665b72 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -197,7 +197,7 @@ def test_info_wide(): reset_option("display.max_info_columns") s = Series(np.random.randn(101)) - msg = "Argument `max_cols` can only be passed " "in DataFrame.info, not Series.info" + msg = "Argument `max_cols` can only be passed in DataFrame.info, not Series.info" with pytest.raises(ValueError, match=msg): s.info(max_cols=1) From 8c6c6f54d4343e1be50ff194541deb226f46e651 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 23 Feb 2020 15:00:47 +0000 Subject: [PATCH 17/59] isort --- pandas/io/formats/info.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 443b08e653d6a..6ac15c852d864 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -5,9 +5,10 @@ from pandas._typing import Dtype, FrameOrSeries -from pandas.core.indexes.api import Index from pandas.core.dtypes.generic import ABCDataFrame +from pandas.core.indexes.api import Index + from pandas.io.formats import format as fmt from pandas.io.formats.printing import pprint_thing From 127f84f0074224643d1902f55d7541063e69be5d Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 18 Apr 2020 11:13:45 +0100 Subject: [PATCH 18/59] fix failing tests due to refactoring, merge conflicts --- pandas/io/formats/info.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 5d684b11df84c..b94327d17f817 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -211,7 +211,10 @@ def _sizeof_fmt(num, size_qualifier): _verbose_repr() # groupby dtype.name to collect e.g. Categorical columns - counts = data.dtypes.value_counts().groupby(lambda x: x.name).sum() + if isinstance(data, ABCDataFrame): + counts = data.dtypes.value_counts().groupby(lambda x: x.name).sum() + else: + counts = {data.dtype.name: 1} dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] lines.append(f"dtypes: {', '.join(dtypes)}") From 965419800fc611cd82f6d68fb20129bf18bad56b Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 20 Apr 2020 11:40:57 +0100 Subject: [PATCH 19/59] resolve conflicts --- pandas/io/formats/info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index b94327d17f817..69ff701918c95 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -85,8 +85,8 @@ def info( if isinstance(data, ABCDataFrame): ids = data.columns - dtypes = data.dtypes col_count = len(ids) + dtypes = data.dtypes if col_count == 0: lines.append(f"Empty {type(data).__name__}") @@ -212,7 +212,7 @@ def _sizeof_fmt(num, size_qualifier): # groupby dtype.name to collect e.g. Categorical columns if isinstance(data, ABCDataFrame): - counts = data.dtypes.value_counts().groupby(lambda x: x.name).sum() + counts = dtypes.value_counts().groupby(lambda x: x.name).sum() else: counts = {data.dtype.name: 1} dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] From c1006a72aceca4876d2e2c10f38872b129508dc9 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 20 Apr 2020 11:46:31 +0100 Subject: [PATCH 20/59] replace appender with doc --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 232d3b605253b..64f49d7b885ca 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4280,7 +4280,7 @@ def replace( Series.describe: Generate descriptive statistics of Series. Series.memory_usage: Memory usage of Series.""", ) - @Appender(info.__doc__) + @doc(info) def info( self, verbose: Optional[bool] = None, From 3592e8ebc7852b78d1831c8780ed052d4c90141a Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 22 Apr 2020 18:56:07 +0100 Subject: [PATCH 21/59] indent series.info subs --- pandas/core/series.py | 126 ++++++++++++++++++++++-------------------- 1 file changed, 65 insertions(+), 61 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 111549488d388..3eb4750b6aea7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4218,67 +4218,71 @@ def replace( klass="Series", type_sub="", max_cols_sub="", - examples_sub=""" ->>> int_values = [1, 2, 3, 4, 5] ->>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] ->>> s = pd.Series(text_values, index=int_values) ->>> s.info() - -Int64Index: 5 entries, 1 to 5 -Series name: None - # Non-Null Count Dtype ---- -------------- ----- - 0 5 non-null object -dtypes: object(1) -memory usage: 80.0+ bytes - -Prints a summary excluding information about its values: - ->>> s.info(verbose=False) - -Int64Index: 5 entries, 1 to 5 -dtypes: object(1) -memory usage: 80.0+ bytes - -Pipe output of Series.info to buffer instead of sys.stdout, get -buffer content and writes to a text file: - ->>> import io ->>> buffer = io.StringIO() ->>> s.info(buf=buffer) ->>> s = buffer.getvalue() ->>> with open("df_info.txt", "w", -... encoding="utf-8") as f: # doctest: +SKIP -... f.write(s) -260 - -The `memory_usage` parameter allows deep introspection mode, specially -useful for big Series and fine-tune memory optimization: - ->>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) ->>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6)) ->>> s.info() - -RangeIndex: 1000000 entries, 0 to 999999 -Series name: None - # Non-Null Count Dtype ---- -------------- ----- - 0 1000000 non-null object -dtypes: object(1) -memory usage: 7.6+ MB - ->>> s.info(memory_usage='deep') - -RangeIndex: 1000000 entries, 0 to 999999 -Series name: None - # Non-Null Count Dtype ---- -------------- ----- - 0 1000000 non-null object -dtypes: object(1) -memory usage: 62.9 MB""", - see_also_sub=""" -Series.describe: Generate descriptive statistics of Series. -Series.memory_usage: Memory usage of Series.""", + examples_sub=( + """ + >>> int_values = [1, 2, 3, 4, 5] + >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] + >>> s = pd.Series(text_values, index=int_values) + >>> s.info() + + Int64Index: 5 entries, 1 to 5 + Series name: None + # Non-Null Count Dtype + --- -------------- ----- + 0 5 non-null object + dtypes: object(1) + memory usage: 80.0+ bytes + + Prints a summary excluding information about its values: + + >>> s.info(verbose=False) + + Int64Index: 5 entries, 1 to 5 + dtypes: object(1) + memory usage: 80.0+ bytes + + Pipe output of Series.info to buffer instead of sys.stdout, get + buffer content and writes to a text file: + + >>> import io + >>> buffer = io.StringIO() + >>> s.info(buf=buffer) + >>> s = buffer.getvalue() + >>> with open("df_info.txt", "w", + ... encoding="utf-8") as f: # doctest: +SKIP + ... f.write(s) + 260 + + The `memory_usage` parameter allows deep introspection mode, specially + useful for big Series and fine-tune memory optimization: + + >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6)) + >>> s.info() + + RangeIndex: 1000000 entries, 0 to 999999 + Series name: None + # Non-Null Count Dtype + --- -------------- ----- + 0 1000000 non-null object + dtypes: object(1) + memory usage: 7.6+ MB + + >>> s.info(memory_usage='deep') + + RangeIndex: 1000000 entries, 0 to 999999 + Series name: None + # Non-Null Count Dtype + --- -------------- ----- + 0 1000000 non-null object + dtypes: object(1) + memory usage: 62.9 MB""" + ), + see_also_sub=( + """ + Series.describe: Generate descriptive statistics of Series. + Series.memory_usage: Memory usage of Series.""" + ), ) @doc(info) def info( From af771e694e28de0d06faa7d3797787801b563c2a Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 22 Apr 2020 18:59:17 +0100 Subject: [PATCH 22/59] revert deleted line --- pandas/io/formats/info.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index a95b8b60d36d6..7cf842d20e361 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -47,6 +47,7 @@ def info( Specifies whether total memory usage of the %(klass)s elements (including the index) should be displayed. By default, this follows the ``pandas.options.display.memory_usage`` setting. + True always show memory usage. False never shows memory usage. A value of 'deep' is equivalent to "True with deep introspection". Memory usage is shown in human-readable units (base-2 From 317a14836cbf96087124b617ec0cd6b9921c4a01 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 22 Apr 2020 19:18:06 +0100 Subject: [PATCH 23/59] fix indentation in doctests --- pandas/core/series.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3eb4750b6aea7..872c2a3aa233c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4262,9 +4262,9 @@ def replace( RangeIndex: 1000000 entries, 0 to 999999 Series name: None - # Non-Null Count Dtype + # Non-Null Count Dtype --- -------------- ----- - 0 1000000 non-null object + 0 1000000 non-null object dtypes: object(1) memory usage: 7.6+ MB @@ -4272,9 +4272,9 @@ def replace( RangeIndex: 1000000 entries, 0 to 999999 Series name: None - # Non-Null Count Dtype + # Non-Null Count Dtype --- -------------- ----- - 0 1000000 non-null object + 0 1000000 non-null object dtypes: object(1) memory usage: 62.9 MB""" ), From ae0065bc180ea55535d67019ce5355f187a34877 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 12 May 2020 20:48:27 +0100 Subject: [PATCH 24/59] reuse col_count --- pandas/io/formats/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index c8f05e246f5d3..be84d93195483 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -185,7 +185,7 @@ def _verbose_repr(): if isinstance(data, ABCDataFrame): header += _put_str(column_head, space) if show_counts: - if len(ids) != len(counts): # pragma: no cover + if col_count != len(counts): # pragma: no cover raise AssertionError( f"Columns must equal counts ({col_count} != {len(counts)})" ) From c36d4c40516e054224ffe630343765b7d351c224 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 12 May 2020 21:05:01 +0100 Subject: [PATCH 25/59] reorder to reduce diff size --- pandas/io/formats/info.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index be84d93195483..2563b386b80c9 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -134,29 +134,27 @@ def info( lines.append(str(type(data))) lines.append(data.index._summary()) - max_rows = get_option("display.max_info_rows", len(data) + 1) ids, dtypes = _get_ids_and_dtypes(data) col_count = len(ids) - if isinstance(data, ABCDataFrame): - if col_count == 0: - lines.append(f"Empty {type(data).__name__}") - fmt.buffer_put_lines(buf, lines) - return + if col_count == 0 and isinstance(data, ABCDataFrame): + lines.append(f"Empty {type(data).__name__}") + fmt.buffer_put_lines(buf, lines) + return - # hack - if max_cols is None: - max_cols = get_option("display.max_info_columns", col_count + 1) + # hack + if max_cols is None and isinstance(data, ABCDataFrame): + max_cols = get_option("display.max_info_columns", col_count + 1) - if null_counts is None: - show_counts = (col_count <= max_cols) and (len(data) < max_rows) - else: - show_counts = null_counts - exceeds_info_cols = col_count > max_cols + max_rows = get_option("display.max_info_rows", len(data) + 1) + if null_counts is None and isinstance(data, ABCDataFrame): + show_counts = (col_count <= max_cols) and (len(data) < max_rows) + elif isinstance(data, ABCDataFrame): + show_counts = null_counts else: - exceeds_info_cols = False show_counts = True + exceeds_info_cols = isinstance(data, ABCDataFrame) and col_count > max_cols def _verbose_repr(): From 751d34678b8d75811917919c9c775f48dd78863e Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 16 May 2020 19:19:02 +0100 Subject: [PATCH 26/59] help mypy --- pandas/io/formats/info.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 2563b386b80c9..601fa370fc532 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -147,6 +147,7 @@ def info( max_cols = get_option("display.max_info_columns", col_count + 1) max_rows = get_option("display.max_info_rows", len(data) + 1) + assert max_cols is not None # help mypy if null_counts is None and isinstance(data, ABCDataFrame): show_counts = (col_count <= max_cols) and (len(data) < max_rows) From 631d914dab23c24ef5fbdcc061127f3599fda55e Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 16 May 2020 19:49:16 +0100 Subject: [PATCH 27/59] aftermentioned 'help' should only be applied for DataFrame case --- pandas/io/formats/info.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 601fa370fc532..42b47bc03ed7a 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -147,15 +147,20 @@ def info( max_cols = get_option("display.max_info_columns", col_count + 1) max_rows = get_option("display.max_info_rows", len(data) + 1) - assert max_cols is not None # help mypy if null_counts is None and isinstance(data, ABCDataFrame): + assert max_cols is not None # help mypy show_counts = (col_count <= max_cols) and (len(data) < max_rows) elif isinstance(data, ABCDataFrame): show_counts = null_counts else: show_counts = True - exceeds_info_cols = isinstance(data, ABCDataFrame) and col_count > max_cols + + if isinstance(data, ABCDataFrame): + assert max_cols is not None # help mypy + exceeds_info_cols = col_count > max_cols + else: + exceeds_info_cols = False def _verbose_repr(): From 23bd173cfe4d396725f85c1c83823c76c93c453c Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 16 May 2020 21:17:27 +0100 Subject: [PATCH 28/59] add docstring to _get_ids_and_dtypes --- pandas/io/formats/info.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 42b47bc03ed7a..250778a53b3e7 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -44,19 +44,19 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: def _get_ids_and_dtypes(data: FrameOrSeries) -> Tuple["Index", "Series"]: """ - Get DataFrame's columns and dtypes. + Get DataFrame's columns (or Series' name) and dtypes. Parameters ---------- - data : DataFrame + data : DataFrame or Series Object that `info` was called on. Returns ------- ids : Index - DataFrame's columns. + DataFrame's columns or Series' name. dtypes : Series - Dtype of each of the DataFrame's columns. + Dtype of each of the DataFrame's columns or the Series' dtype. """ if isinstance(data, ABCDataFrame): ids = data.columns From 304f44592f0388b547f77c1eff62a7b73f015425 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 19 May 2020 19:03:56 +0100 Subject: [PATCH 29/59] correct return type of _get_ids_and_dtypes, as in Series case dtypes is an Index --- pandas/io/formats/info.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 250778a53b3e7..758489533edc9 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -42,7 +42,9 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) -def _get_ids_and_dtypes(data: FrameOrSeries) -> Tuple["Index", "Series"]: +def _get_ids_and_dtypes( + data: FrameOrSeries, +) -> Tuple["Index", Union["Series", "Index"]]: """ Get DataFrame's columns (or Series' name) and dtypes. From a2d6e43c8b6384050d5411e9a19dd1f67e762005 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 19 May 2020 19:10:31 +0100 Subject: [PATCH 30/59] return Series for dtypes in all cases --- pandas/io/formats/info.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 758489533edc9..b4cafba2401b5 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -42,9 +42,7 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) -def _get_ids_and_dtypes( - data: FrameOrSeries, -) -> Tuple["Index", Union["Series", "Index"]]: +def _get_ids_and_dtypes(data: FrameOrSeries,) -> Tuple["Index", "Series"]: """ Get DataFrame's columns (or Series' name) and dtypes. @@ -65,7 +63,7 @@ def _get_ids_and_dtypes( dtypes = data.dtypes else: ids = Index([data.name]) - dtypes = Index([data.dtypes]) + dtypes = data._constructor(data.dtypes) return ids, dtypes From f33f0df30f151a381d663ae631c204cdc1bbb06a Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 19 May 2020 19:11:53 +0100 Subject: [PATCH 31/59] black bug --- pandas/io/formats/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index b4cafba2401b5..1ee176271d1c0 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -42,7 +42,7 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) -def _get_ids_and_dtypes(data: FrameOrSeries,) -> Tuple["Index", "Series"]: +def _get_ids_and_dtypes(data: FrameOrSeries) -> Tuple["Index", "Series"]: """ Get DataFrame's columns (or Series' name) and dtypes. From 22de3c5ea811db29254bae755ee4754b95266a06 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 30 May 2020 13:41:44 +0100 Subject: [PATCH 32/59] reduce if/then --- pandas/io/formats/info.py | 84 ++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 1ee176271d1c0..e8f70e743c8c6 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -42,9 +42,30 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) +def _get_counts(data: FrameOrSeries) -> "Series": + """ + Get DataFrame or Series' counts. + + Parameters + ---------- + data : DataFrame or Series + Object that `info` was called on. + + Returns + ------- + counts : Series + Count non-NA cells (for each column in the DataFrame case). + """ + if isinstance(data, ABCDataFrame): + counts = data.count() + else: + counts = data._constructor(data.count()) + return counts + + def _get_ids_and_dtypes(data: FrameOrSeries) -> Tuple["Index", "Series"]: """ - Get DataFrame's columns (or Series' name) and dtypes. + Get DataFrame or Series' columns/name and dtypes. Parameters ---------- @@ -137,58 +158,49 @@ def info( ids, dtypes = _get_ids_and_dtypes(data) col_count = len(ids) - if col_count == 0 and isinstance(data, ABCDataFrame): + if col_count == 0: lines.append(f"Empty {type(data).__name__}") fmt.buffer_put_lines(buf, lines) return # hack - if max_cols is None and isinstance(data, ABCDataFrame): + if max_cols is None: max_cols = get_option("display.max_info_columns", col_count + 1) max_rows = get_option("display.max_info_rows", len(data) + 1) - if null_counts is None and isinstance(data, ABCDataFrame): - assert max_cols is not None # help mypy + if null_counts is None: show_counts = (col_count <= max_cols) and (len(data) < max_rows) - elif isinstance(data, ABCDataFrame): - show_counts = null_counts else: - show_counts = True - - if isinstance(data, ABCDataFrame): - assert max_cols is not None # help mypy - exceeds_info_cols = col_count > max_cols - else: - exceeds_info_cols = False + show_counts = null_counts + exceeds_info_cols = col_count > max_cols def _verbose_repr(): id_head = " # " - id_space = 2 + column_head = "Column" + col_space = 2 + max_col = max(len(pprint_thing(k)) for k in ids) + len_column = len(pprint_thing(column_head)) + + max_id = len(pprint_thing(col_count)) len_id = len(pprint_thing(id_head)) + space_num = max(max_id, len_id) + col_space + + header = _put_str(id_head, space_num) if isinstance(data, ABCDataFrame): - column_head = "Column" lines.append(f"Data columns (total {col_count} columns):") - counts = data.count() - max_id = len(pprint_thing(col_count)) - max_col = max(len(pprint_thing(k)) for k in ids) len_column = len(pprint_thing(column_head)) - space = max(max_col, len_column) + id_space - space_num = max(max_id, len_id) + id_space - column_string = _put_str("-" * len_column, space) + space = max(max_col, len_column) + col_space + header += _put_str(column_head, space) else: lines.append(f"Series name: {data.name}") - counts = Index([data.count()]) - space_num = len_id + id_space - column_string = "" + space = 0 - header = _put_str(id_head, space_num) - if isinstance(data, ABCDataFrame): - header += _put_str(column_head, space) if show_counts: + counts = _get_counts(data) if col_count != len(counts): # pragma: no cover raise AssertionError( f"Columns must equal counts ({col_count} != {len(counts)})" @@ -197,7 +209,7 @@ def _verbose_repr(): len_count = len(count_header) non_null = " non-null" max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) - space_count = max(len_count, max_count) + id_space + space_count = max(len_count, max_count) + col_space count_temp = "{count}" + non_null else: count_header = "" @@ -216,7 +228,7 @@ def _verbose_repr(): lines.append(header) lines.append( _put_str("-" * len_id, space_num) - + column_string + + _put_str("-" * len_column, space) + _put_str("-" * len_count, space_count) + _put_str("-" * len_dtype, space_dtype) ) @@ -230,14 +242,9 @@ def _verbose_repr(): if show_counts: count = counts[i] - if isinstance(data, ABCDataFrame): - column_string = _put_str(id_, space) - else: - column_string = "" - lines.append( line_no - + column_string + + _put_str(id_, space) + _put_str(count_temp.format(count=count), space_count) + _put_str(dtype, space_dtype) ) @@ -265,10 +272,7 @@ def _sizeof_fmt(num, size_qualifier): _verbose_repr() # groupby dtype.name to collect e.g. Categorical columns - if isinstance(data, ABCDataFrame): - counts = dtypes.value_counts().groupby(lambda x: x.name).sum() - else: - counts = {data.dtype.name: 1} + counts = dtypes.value_counts().groupby(lambda x: x.name).sum() collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] lines.append(f"dtypes: {', '.join(collected_dtypes)}") From 8a58bd61e3fbdf58a7759a1eb4785ac21900eef7 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 30 May 2020 13:42:57 +0100 Subject: [PATCH 33/59] simplify diff --- pandas/io/formats/info.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index e8f70e743c8c6..1477031384aa2 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -233,9 +233,9 @@ def _verbose_repr(): + _put_str("-" * len_dtype, space_dtype) ) - for i, id_ in enumerate(ids): + for i, col in enumerate(ids): dtype = dtypes[i] - id_ = pprint_thing(id_) + col = pprint_thing(col) line_no = _put_str(f" {i}", space_num) count = "" @@ -244,7 +244,7 @@ def _verbose_repr(): lines.append( line_no - + _put_str(id_, space) + + _put_str(col, space) + _put_str(count_temp.format(count=count), space_count) + _put_str(dtype, space_dtype) ) From 21d263c139f8a18ca7c2598c0a302f22c3fa83bd Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 30 May 2020 13:52:19 +0100 Subject: [PATCH 34/59] factor out memory usage --- pandas/io/formats/info.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 1477031384aa2..3e046acafdee8 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -42,6 +42,31 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) +def _get_mem_usage(data: FrameOrSeries, deep: bool) -> int: + """ + Get DataFrame or Series' memory usage in bytes. + + Parameters + ---------- + data : DataFrame or Series + Object that `info` was called on. + deep : bool + If True, introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. + + Returns + ------- + mem_usage : int + Object's total memory usage in bytes. + """ + if isinstance(data, ABCDataFrame): + mem_usage = data.memory_usage(index=True, deep=deep).sum() + else: + mem_usage = data.memory_usage(index=True, deep=deep) + return mem_usage + + def _get_counts(data: FrameOrSeries) -> "Series": """ Get DataFrame or Series' counts. @@ -290,9 +315,6 @@ def _sizeof_fmt(num, size_qualifier): deep = False if "object" in counts or data.index._is_memory_usage_qualified(): size_qualifier = "+" - if isinstance(data, ABCDataFrame): - mem_usage = data.memory_usage(index=True, deep=deep).sum() - else: - mem_usage = data.memory_usage(index=True, deep=deep) + mem_usage = _get_mem_usage(data, deep=deep) lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") fmt.buffer_put_lines(buf, lines) From cfa80395bad89387d59b8b9dd43db20a857c6daa Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 30 May 2020 14:05:20 +0100 Subject: [PATCH 35/59] clarify docstring --- pandas/io/formats/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 3e046acafdee8..a0867d19a6cf7 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -69,7 +69,7 @@ def _get_mem_usage(data: FrameOrSeries, deep: bool) -> int: def _get_counts(data: FrameOrSeries) -> "Series": """ - Get DataFrame or Series' counts. + Get DataFrame or Series' non-NA counts. Parameters ---------- From 3811545105293bec6fd45e2325477c9a14c9d907 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 6 Jun 2020 11:09:27 +0100 Subject: [PATCH 36/59] initial OOP approach --- pandas/core/frame.py | 8 +- pandas/core/series.py | 8 +- pandas/io/formats/info.py | 583 +++++++++++++++++++++----------------- 3 files changed, 333 insertions(+), 266 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2d181e826c2a9..0c6289e164509 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -140,7 +140,7 @@ from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt -from pandas.io.formats.info import info +from pandas.io.formats.info import DataFrameInfo, Info import pandas.plotting if TYPE_CHECKING: @@ -2510,7 +2510,7 @@ def to_html( DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), ) - @doc(info) + @doc(Info) def info( self, verbose: Optional[bool] = None, @@ -2519,7 +2519,9 @@ def info( memory_usage: Optional[Union[bool, str]] = None, null_counts: Optional[bool] = None, ) -> None: - return info(self, verbose, buf, max_cols, memory_usage, null_counts) + return DataFrameInfo( + self, verbose, buf, max_cols, memory_usage, null_counts + ).get_info() def memory_usage(self, index=True, deep=False) -> Series: """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 8026dab4e946b..384cb5df95e0b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -88,7 +88,7 @@ from pandas.core.tools.datetimes import to_datetime import pandas.io.formats.format as fmt -from pandas.io.formats.info import info +from pandas.io.formats.info import Info, SeriesInfo import pandas.plotting if TYPE_CHECKING: @@ -4405,7 +4405,7 @@ def replace( Series.memory_usage: Memory usage of Series.""" ), ) - @doc(info) + @doc(Info) def info( self, verbose: Optional[bool] = None, @@ -4419,7 +4419,9 @@ def info( "Argument `max_cols` can only be passed " "in DataFrame.info, not Series.info" ) - return info(self, verbose, buf, None, memory_usage, null_counts) + return SeriesInfo( + self, verbose, buf, max_cols, memory_usage, null_counts + ).get_info() @doc(NDFrame.shift, **_shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index a0867d19a6cf7..dc15b1fc4e2c9 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -42,279 +42,342 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) -def _get_mem_usage(data: FrameOrSeries, deep: bool) -> int: - """ - Get DataFrame or Series' memory usage in bytes. - - Parameters - ---------- - data : DataFrame or Series - Object that `info` was called on. - deep : bool - If True, introspect the data deeply by interrogating object dtypes - for system-level memory consumption, and include it in the returned - values. - - Returns - ------- - mem_usage : int - Object's total memory usage in bytes. - """ - if isinstance(data, ABCDataFrame): - mem_usage = data.memory_usage(index=True, deep=deep).sum() - else: - mem_usage = data.memory_usage(index=True, deep=deep) - return mem_usage - - -def _get_counts(data: FrameOrSeries) -> "Series": - """ - Get DataFrame or Series' non-NA counts. - - Parameters - ---------- - data : DataFrame or Series - Object that `info` was called on. - - Returns - ------- - counts : Series - Count non-NA cells (for each column in the DataFrame case). - """ - if isinstance(data, ABCDataFrame): - counts = data.count() - else: - counts = data._constructor(data.count()) - return counts - - -def _get_ids_and_dtypes(data: FrameOrSeries) -> Tuple["Index", "Series"]: - """ - Get DataFrame or Series' columns/name and dtypes. - - Parameters - ---------- - data : DataFrame or Series - Object that `info` was called on. - - Returns - ------- - ids : Index - DataFrame's columns or Series' name. - dtypes : Series - Dtype of each of the DataFrame's columns or the Series' dtype. - """ - if isinstance(data, ABCDataFrame): - ids = data.columns - dtypes = data.dtypes - else: - ids = Index([data.name]) - dtypes = data._constructor(data.dtypes) - return ids, dtypes - - -def info( - data: FrameOrSeries, - verbose: Optional[bool] = None, - buf: Optional[IO[str]] = None, - max_cols: Optional[int] = None, - memory_usage: Optional[Union[bool, str]] = None, - null_counts: Optional[bool] = None, -) -> None: - """ - Print a concise summary of a %(klass)s. - - This method prints information about a %(klass)s including - the index dtype%(type_sub)s, non-null values and memory usage. - - Parameters - ---------- - data : %(klass)s - %(klass)s to print information about. - verbose : bool, optional - Whether to print the full summary. By default, the setting in - ``pandas.options.display.max_info_columns`` is followed. - buf : writable buffer, defaults to sys.stdout - Where to send the output. By default, the output is printed to - sys.stdout. Pass a writable buffer if you need to further process - the output. - %(max_cols_sub)s - memory_usage : bool, str, optional - Specifies whether total memory usage of the %(klass)s - elements (including the index) should be displayed. By default, - this follows the ``pandas.options.display.memory_usage`` setting. - - True always show memory usage. False never shows memory usage. - A value of 'deep' is equivalent to "True with deep introspection". - Memory usage is shown in human-readable units (base-2 - representation). Without deep introspection a memory estimation is - made based in column dtype and number of rows assuming values - consume the same memory amount for corresponding dtypes. With deep - memory introspection, a real memory usage calculation is performed - at the cost of computational resources. - null_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the %(klass)s is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. - - Returns - ------- - None - This method prints a summary of a %(klass)s and returns None. - - See Also - -------- - %(see_also_sub)s - - Examples - -------- - %(examples_sub)s - """ - if buf is None: # pragma: no cover - buf = sys.stdout - - lines = [] - - lines.append(str(type(data))) - lines.append(data.index._summary()) +class Info: + def __init__( + self, + data: FrameOrSeries, + verbose: Optional[bool] = None, + buf: Optional[IO[str]] = None, + max_cols: Optional[int] = None, + memory_usage: Optional[Union[bool, str]] = None, + null_counts: Optional[bool] = None, + ): + if buf is None: # pragma: no cover + buf = sys.stdout + if memory_usage is None: + memory_usage = get_option("display.memory_usage") + + self.data = data + self.verbose = verbose + self.buf = buf + self.max_cols = max_cols + self.memory_usage = memory_usage + self.null_counts = null_counts + + def get_info(self) -> None: + """ + Print a concise summary of a %(klass)s. + + This method prints information about a %(klass)s including + the index dtype%(type_sub)s, non-null values and memory usage. + + Parameters + ---------- + data : %(klass)s + %(klass)s to print information about. + verbose : bool, optional + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. + buf : writable buffer, defaults to sys.stdout + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + %(max_cols_sub)s + memory_usage : bool, str, optional + Specifies whether total memory usage of the %(klass)s + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. + + True always show memory usage. False never shows memory usage. + A value of 'deep' is equivalent to "True with deep introspection". + Memory usage is shown in human-readable units (base-2 + representation). Without deep introspection a memory estimation is + made based in column dtype and number of rows assuming values + consume the same memory amount for corresponding dtypes. With deep + memory introspection, a real memory usage calculation is performed + at the cost of computational resources. + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the %(klass)s is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. + + Returns + ------- + None + This method prints a summary of a %(klass)s and returns None. + + See Also + -------- + %(see_also_sub)s + + Examples + -------- + %(examples_sub)s + """ + # move this to init + + lines = [] + + lines.append(str(type(self.data))) + lines.append(self.data.index._summary()) + + ids, dtypes = self._get_ids_and_dtypes() + col_count = len(ids) + + if col_count == 0: + lines.append(f"Empty {type(self.data).__name__}") + fmt.buffer_put_lines(self.buf, lines) + return + + # hack + max_cols = self.max_cols + if max_cols is None: + max_cols = get_option("display.max_info_columns", col_count + 1) + + max_rows = get_option("display.max_info_rows", len(self.data) + 1) + + if self.null_counts is None: + show_counts = (col_count <= max_cols) and (len(self.data) < max_rows) + else: + show_counts = self.null_counts + exceeds_info_cols = col_count > max_cols - ids, dtypes = _get_ids_and_dtypes(data) - col_count = len(ids) + def _verbose_repr(): - if col_count == 0: - lines.append(f"Empty {type(data).__name__}") - fmt.buffer_put_lines(buf, lines) - return + id_head = " # " + column_head = "Column" + col_space = 2 - # hack - if max_cols is None: - max_cols = get_option("display.max_info_columns", col_count + 1) + max_col = max(len(pprint_thing(k)) for k in ids) + len_column = len(pprint_thing(column_head)) - max_rows = get_option("display.max_info_rows", len(data) + 1) + max_id = len(pprint_thing(col_count)) + len_id = len(pprint_thing(id_head)) + space_num = max(max_id, len_id) + col_space - if null_counts is None: - show_counts = (col_count <= max_cols) and (len(data) < max_rows) - else: - show_counts = null_counts - exceeds_info_cols = col_count > max_cols + header = _put_str(id_head, space_num) - def _verbose_repr(): + if isinstance(self.data, ABCDataFrame): + lines.append(f"Data columns (total {col_count} columns):") + len_column = len(pprint_thing(column_head)) + space = max(max_col, len_column) + col_space + header += _put_str(column_head, space) + else: + lines.append(f"Series name: {self.data.name}") + space = 0 - id_head = " # " - column_head = "Column" - col_space = 2 + if show_counts: + counts = self._get_counts() + if col_count != len(counts): # pragma: no cover + raise AssertionError( + f"Columns must equal counts ({col_count} != {len(counts)})" + ) + count_header = "Non-Null Count" + len_count = len(count_header) + non_null = " non-null" + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + space_count = max(len_count, max_count) + col_space + count_temp = "{count}" + non_null + else: + count_header = "" + space_count = len(count_header) + len_count = space_count + count_temp = "{count}" + + dtype_header = "Dtype" + len_dtype = len(dtype_header) + max_dtypes = max(len(pprint_thing(k)) for k in dtypes) + space_dtype = max(len_dtype, max_dtypes) + header += _put_str(count_header, space_count) + _put_str( + dtype_header, space_dtype + ) - max_col = max(len(pprint_thing(k)) for k in ids) - len_column = len(pprint_thing(column_head)) + lines.append(header) + lines.append( + _put_str("-" * len_id, space_num) + + _put_str("-" * len_column, space) + + _put_str("-" * len_count, space_count) + + _put_str("-" * len_dtype, space_dtype) + ) - max_id = len(pprint_thing(col_count)) - len_id = len(pprint_thing(id_head)) - space_num = max(max_id, len_id) + col_space + for i, col in enumerate(ids): + dtype = dtypes[i] + col = pprint_thing(col) - header = _put_str(id_head, space_num) + line_no = _put_str(f" {i}", space_num) + count = "" + if show_counts: + count = counts[i] - if isinstance(data, ABCDataFrame): - lines.append(f"Data columns (total {col_count} columns):") - len_column = len(pprint_thing(column_head)) - space = max(max_col, len_column) + col_space - header += _put_str(column_head, space) - else: - lines.append(f"Series name: {data.name}") - space = 0 - - if show_counts: - counts = _get_counts(data) - if col_count != len(counts): # pragma: no cover - raise AssertionError( - f"Columns must equal counts ({col_count} != {len(counts)})" + lines.append( + line_no + + _put_str(col, space) + + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtype, space_dtype) ) - count_header = "Non-Null Count" - len_count = len(count_header) - non_null = " non-null" - max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) - space_count = max(len_count, max_count) + col_space - count_temp = "{count}" + non_null - else: - count_header = "" - space_count = len(count_header) - len_count = space_count - count_temp = "{count}" - - dtype_header = "Dtype" - len_dtype = len(dtype_header) - max_dtypes = max(len(pprint_thing(k)) for k in dtypes) - space_dtype = max(len_dtype, max_dtypes) - header += _put_str(count_header, space_count) + _put_str( - dtype_header, space_dtype - ) - - lines.append(header) - lines.append( - _put_str("-" * len_id, space_num) - + _put_str("-" * len_column, space) - + _put_str("-" * len_count, space_count) - + _put_str("-" * len_dtype, space_dtype) - ) - - for i, col in enumerate(ids): - dtype = dtypes[i] - col = pprint_thing(col) - - line_no = _put_str(f" {i}", space_num) - count = "" - if show_counts: - count = counts[i] - lines.append( - line_no - + _put_str(col, space) - + _put_str(count_temp.format(count=count), space_count) - + _put_str(dtype, space_dtype) - ) + def _non_verbose_repr(): + if isinstance(self.data, ABCDataFrame): + lines.append(ids._summary(name="Columns")) - def _non_verbose_repr(): - if isinstance(data, ABCDataFrame): - lines.append(ids._summary(name="Columns")) - - def _sizeof_fmt(num, size_qualifier): - # returns size in human readable format - for x in ["bytes", "KB", "MB", "GB", "TB"]: - if num < 1024.0: - return f"{num:3.1f}{size_qualifier} {x}" - num /= 1024.0 - return f"{num:3.1f}{size_qualifier} PB" - - if verbose: - _verbose_repr() - elif verbose is False: # specifically set to False, not necessarily None - _non_verbose_repr() - else: - if exceeds_info_cols: - _non_verbose_repr() - else: - _verbose_repr() + def _sizeof_fmt(num, size_qualifier): + # returns size in human readable format + for x in ["bytes", "KB", "MB", "GB", "TB"]: + if num < 1024.0: + return f"{num:3.1f}{size_qualifier} {x}" + num /= 1024.0 + return f"{num:3.1f}{size_qualifier} PB" - # groupby dtype.name to collect e.g. Categorical columns - counts = dtypes.value_counts().groupby(lambda x: x.name).sum() - collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] - lines.append(f"dtypes: {', '.join(collected_dtypes)}") - - if memory_usage is None: - memory_usage = get_option("display.memory_usage") - if memory_usage: - # append memory usage of df to display - size_qualifier = "" - if memory_usage == "deep": - deep = True + if self.verbose: + _verbose_repr() + elif self.verbose is False: # specifically set to False, not necessarily None + _non_verbose_repr() else: - # size_qualifier is just a best effort; not guaranteed to catch - # all cases (e.g., it misses categorical data even with object - # categories) - deep = False - if "object" in counts or data.index._is_memory_usage_qualified(): - size_qualifier = "+" - mem_usage = _get_mem_usage(data, deep=deep) - lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") - fmt.buffer_put_lines(buf, lines) + if exceeds_info_cols: + _non_verbose_repr() + else: + _verbose_repr() + + # groupby dtype.name to collect e.g. Categorical columns + counts = dtypes.value_counts().groupby(lambda x: x.name).sum() + collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] + lines.append(f"dtypes: {', '.join(collected_dtypes)}") + + if self.memory_usage: + # append memory usage of df to display + size_qualifier = "" + if self.memory_usage == "deep": + deep = True + else: + # size_qualifier is just a best effort; not guaranteed to catch + # all cases (e.g., it misses categorical data even with object + # categories) + deep = False + if "object" in counts or self.data.index._is_memory_usage_qualified(): + size_qualifier = "+" + mem_usage = self._get_mem_usage(deep=deep) + lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") + fmt.buffer_put_lines(self.buf, lines) + + def _get_mem_usage(self, deep: bool) -> int: + raise NotImplementedError + + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + raise NotImplementedError + + def _get_counts(self) -> "Series": + raise NotImplementedError + + +class DataFrameInfo(Info): + def _get_mem_usage(self, deep: bool) -> int: + """ + Get DataFrame or Series' memory usage in bytes. + + Parameters + ---------- + data : DataFrame or Series + Object that `info` was called on. + deep : bool + If True, introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. + + Returns + ------- + mem_usage : int + Object's total memory usage in bytes. + """ + return self.data.memory_usage(index=True, deep=deep).sum() + + def _get_counts(self) -> "Series": + """ + Get DataFrame or Series' non-NA counts. + + Parameters + ---------- + data : DataFrame or Series + Object that `info` was called on. + + Returns + ------- + counts : Series + Count non-NA cells (for each column in the DataFrame case). + """ + return self.data.count() + + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + """ + Get DataFrame or Series' columns/name and dtypes. + + Parameters + ---------- + data : DataFrame or Series + Object that `info` was called on. + + Returns + ------- + ids : Index + DataFrame's columns or Series' name. + dtypes : Series + Dtype of each of the DataFrame's columns or the Series' dtype. + """ + return self.data.columns, self.data.dtypes + + +class SeriesInfo(Info): + def _get_mem_usage(self, deep: bool) -> int: + """ + Get DataFrame or Series' memory usage in bytes. + + Parameters + ---------- + data : DataFrame or Series + Object that `info` was called on. + deep : bool + If True, introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. + + Returns + ------- + mem_usage : int + Object's total memory usage in bytes. + """ + return self.data.memory_usage(index=True, deep=deep) + + def _get_counts(self) -> "Series": + """ + Get DataFrame or Series' non-NA counts. + + Parameters + ---------- + data : DataFrame or Series + Object that `info` was called on. + + Returns + ------- + counts : Series + Count non-NA cells (for each column in the DataFrame case). + """ + return self.data._constructor(self.data.count()) + + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + """ + Get DataFrame or Series' columns/name and dtypes. + + Parameters + ---------- + data : DataFrame or Series + Object that `info` was called on. + + Returns + ------- + ids : Index + DataFrame's columns or Series' name. + dtypes : Series + Dtype of each of the DataFrame's columns or the Series' dtype. + """ + return Index([self.data.name]), self.data._constructor(self.data.dtypes) From 6bcbef79fda99d8b4a1027c742568219c418dcef Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 6 Jun 2020 12:51:11 +0100 Subject: [PATCH 37/59] space method --- pandas/io/formats/info.py | 124 ++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 58 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index dc15b1fc4e2c9..6950f20148bae 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -43,6 +43,58 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: class Info: + """ + Print a concise summary of a %(klass)s. + + This method prints information about a %(klass)s including + the index dtype%(type_sub)s, non-null values and memory usage. + + Parameters + ---------- + data : %(klass)s + %(klass)s to print information about. + verbose : bool, optional + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. + buf : writable buffer, defaults to sys.stdout + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + %(max_cols_sub)s + memory_usage : bool, str, optional + Specifies whether total memory usage of the %(klass)s + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. + + True always show memory usage. False never shows memory usage. + A value of 'deep' is equivalent to "True with deep introspection". + Memory usage is shown in human-readable units (base-2 + representation). Without deep introspection a memory estimation is + made based in column dtype and number of rows assuming values + consume the same memory amount for corresponding dtypes. With deep + memory introspection, a real memory usage calculation is performed + at the cost of computational resources. + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the %(klass)s is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. + + Returns + ------- + None + This method prints a summary of a %(klass)s and returns None. + + See Also + -------- + %(see_also_sub)s + + Examples + -------- + %(examples_sub)s + """ + def __init__( self, data: FrameOrSeries, @@ -65,59 +117,6 @@ def __init__( self.null_counts = null_counts def get_info(self) -> None: - """ - Print a concise summary of a %(klass)s. - - This method prints information about a %(klass)s including - the index dtype%(type_sub)s, non-null values and memory usage. - - Parameters - ---------- - data : %(klass)s - %(klass)s to print information about. - verbose : bool, optional - Whether to print the full summary. By default, the setting in - ``pandas.options.display.max_info_columns`` is followed. - buf : writable buffer, defaults to sys.stdout - Where to send the output. By default, the output is printed to - sys.stdout. Pass a writable buffer if you need to further process - the output. - %(max_cols_sub)s - memory_usage : bool, str, optional - Specifies whether total memory usage of the %(klass)s - elements (including the index) should be displayed. By default, - this follows the ``pandas.options.display.memory_usage`` setting. - - True always show memory usage. False never shows memory usage. - A value of 'deep' is equivalent to "True with deep introspection". - Memory usage is shown in human-readable units (base-2 - representation). Without deep introspection a memory estimation is - made based in column dtype and number of rows assuming values - consume the same memory amount for corresponding dtypes. With deep - memory introspection, a real memory usage calculation is performed - at the cost of computational resources. - null_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the %(klass)s is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. - - Returns - ------- - None - This method prints a summary of a %(klass)s and returns None. - - See Also - -------- - %(see_also_sub)s - - Examples - -------- - %(examples_sub)s - """ - # move this to init - lines = [] lines.append(str(type(self.data))) @@ -162,11 +161,11 @@ def _verbose_repr(): if isinstance(self.data, ABCDataFrame): lines.append(f"Data columns (total {col_count} columns):") len_column = len(pprint_thing(column_head)) - space = max(max_col, len_column) + col_space - header += _put_str(column_head, space) + header += _put_str( + column_head, self.space(max_col, len_column, col_space) + ) else: lines.append(f"Series name: {self.data.name}") - space = 0 if show_counts: counts = self._get_counts() @@ -197,7 +196,7 @@ def _verbose_repr(): lines.append(header) lines.append( _put_str("-" * len_id, space_num) - + _put_str("-" * len_column, space) + + _put_str("-" * len_column, self.space(max_col, len_column, col_space)) + _put_str("-" * len_count, space_count) + _put_str("-" * len_dtype, space_dtype) ) @@ -213,7 +212,7 @@ def _verbose_repr(): lines.append( line_no - + _put_str(col, space) + + _put_str(col, self.space(max_col, len_column, col_space)) + _put_str(count_temp.format(count=count), space_count) + _put_str(dtype, space_dtype) ) @@ -270,6 +269,9 @@ def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: def _get_counts(self) -> "Series": raise NotImplementedError + def space(self, max_col, len_column, col_space): + raise NotImplementedError + class DataFrameInfo(Info): def _get_mem_usage(self, deep: bool) -> int: @@ -326,6 +328,9 @@ def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: """ return self.data.columns, self.data.dtypes + def space(self, max_col, len_column, col_space): + return max(max_col, len_column) + col_space + class SeriesInfo(Info): def _get_mem_usage(self, deep: bool) -> int: @@ -381,3 +386,6 @@ def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: Dtype of each of the DataFrame's columns or the Series' dtype. """ return Index([self.data.name]), self.data._constructor(self.data.dtypes) + + def space(self, max_col, len_column, col_space): + return 0 From a2454846bc4559a82f130106e16f5f7368827713 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 6 Jun 2020 13:22:21 +0100 Subject: [PATCH 38/59] add _verbose_repr method --- pandas/io/formats/info.py | 230 ++++++++++++++++++++++++-------------- 1 file changed, 148 insertions(+), 82 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 6950f20148bae..a275223d761e3 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -143,84 +143,6 @@ def get_info(self) -> None: show_counts = self.null_counts exceeds_info_cols = col_count > max_cols - def _verbose_repr(): - - id_head = " # " - column_head = "Column" - col_space = 2 - - max_col = max(len(pprint_thing(k)) for k in ids) - len_column = len(pprint_thing(column_head)) - - max_id = len(pprint_thing(col_count)) - len_id = len(pprint_thing(id_head)) - space_num = max(max_id, len_id) + col_space - - header = _put_str(id_head, space_num) - - if isinstance(self.data, ABCDataFrame): - lines.append(f"Data columns (total {col_count} columns):") - len_column = len(pprint_thing(column_head)) - header += _put_str( - column_head, self.space(max_col, len_column, col_space) - ) - else: - lines.append(f"Series name: {self.data.name}") - - if show_counts: - counts = self._get_counts() - if col_count != len(counts): # pragma: no cover - raise AssertionError( - f"Columns must equal counts ({col_count} != {len(counts)})" - ) - count_header = "Non-Null Count" - len_count = len(count_header) - non_null = " non-null" - max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) - space_count = max(len_count, max_count) + col_space - count_temp = "{count}" + non_null - else: - count_header = "" - space_count = len(count_header) - len_count = space_count - count_temp = "{count}" - - dtype_header = "Dtype" - len_dtype = len(dtype_header) - max_dtypes = max(len(pprint_thing(k)) for k in dtypes) - space_dtype = max(len_dtype, max_dtypes) - header += _put_str(count_header, space_count) + _put_str( - dtype_header, space_dtype - ) - - lines.append(header) - lines.append( - _put_str("-" * len_id, space_num) - + _put_str("-" * len_column, self.space(max_col, len_column, col_space)) - + _put_str("-" * len_count, space_count) - + _put_str("-" * len_dtype, space_dtype) - ) - - for i, col in enumerate(ids): - dtype = dtypes[i] - col = pprint_thing(col) - - line_no = _put_str(f" {i}", space_num) - count = "" - if show_counts: - count = counts[i] - - lines.append( - line_no - + _put_str(col, self.space(max_col, len_column, col_space)) - + _put_str(count_temp.format(count=count), space_count) - + _put_str(dtype, space_dtype) - ) - - def _non_verbose_repr(): - if isinstance(self.data, ABCDataFrame): - lines.append(ids._summary(name="Columns")) - def _sizeof_fmt(num, size_qualifier): # returns size in human readable format for x in ["bytes", "KB", "MB", "GB", "TB"]: @@ -230,14 +152,14 @@ def _sizeof_fmt(num, size_qualifier): return f"{num:3.1f}{size_qualifier} PB" if self.verbose: - _verbose_repr() + self._verbose_repr(lines, ids, dtypes, show_counts) elif self.verbose is False: # specifically set to False, not necessarily None - _non_verbose_repr() + self._non_verbose_repr(lines, ids) else: if exceeds_info_cols: - _non_verbose_repr() + self._non_verbose_repr(lines, ids) else: - _verbose_repr() + self._verbose_repr(lines, ids, dtypes, show_counts) # groupby dtype.name to collect e.g. Categorical columns counts = dtypes.value_counts().groupby(lambda x: x.name).sum() @@ -272,6 +194,12 @@ def _get_counts(self) -> "Series": def space(self, max_col, len_column, col_space): raise NotImplementedError + def _verbose_repr(self, lines, ids, dtypes, show_counts): + raise NotImplementedError + + def _non_verbose_repr(self, lines, ids): + raise NotImplementedError + class DataFrameInfo(Info): def _get_mem_usage(self, deep: bool) -> int: @@ -331,6 +259,82 @@ def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: def space(self, max_col, len_column, col_space): return max(max_col, len_column) + col_space + def _verbose_repr(self, lines, ids, dtypes, show_counts): + + id_head = " # " + column_head = "Column" + col_space = 2 + col_count = len(ids) + + max_col = max(len(pprint_thing(k)) for k in ids) + len_column = len(pprint_thing(column_head)) + + max_id = len(pprint_thing(col_count)) + len_id = len(pprint_thing(id_head)) + space_num = max(max_id, len_id) + col_space + + header = _put_str(id_head, space_num) + + if isinstance(self.data, ABCDataFrame): + lines.append(f"Data columns (total {col_count} columns):") + len_column = len(pprint_thing(column_head)) + header += _put_str(column_head, self.space(max_col, len_column, col_space)) + else: + lines.append(f"Series name: {self.data.name}") + + if show_counts: + counts = self._get_counts() + if col_count != len(counts): # pragma: no cover + raise AssertionError( + f"Columns must equal counts ({col_count} != {len(counts)})" + ) + count_header = "Non-Null Count" + len_count = len(count_header) + non_null = " non-null" + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + space_count = max(len_count, max_count) + col_space + count_temp = "{count}" + non_null + else: + count_header = "" + space_count = len(count_header) + len_count = space_count + count_temp = "{count}" + + dtype_header = "Dtype" + len_dtype = len(dtype_header) + max_dtypes = max(len(pprint_thing(k)) for k in dtypes) + space_dtype = max(len_dtype, max_dtypes) + header += _put_str(count_header, space_count) + _put_str( + dtype_header, space_dtype + ) + + lines.append(header) + lines.append( + _put_str("-" * len_id, space_num) + + _put_str("-" * len_column, self.space(max_col, len_column, col_space)) + + _put_str("-" * len_count, space_count) + + _put_str("-" * len_dtype, space_dtype) + ) + + for i, col in enumerate(ids): + dtype = dtypes[i] + col = pprint_thing(col) + + line_no = _put_str(f" {i}", space_num) + count = "" + if show_counts: + count = counts[i] + + lines.append( + line_no + + _put_str(col, self.space(max_col, len_column, col_space)) + + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtype, space_dtype) + ) + + def _non_verbose_repr(self, lines, ids): + lines.append(ids._summary(name="Columns")) + class SeriesInfo(Info): def _get_mem_usage(self, deep: bool) -> int: @@ -389,3 +393,65 @@ def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: def space(self, max_col, len_column, col_space): return 0 + + def _verbose_repr(self, lines, ids, dtypes, show_counts): + + id_head = " # " + col_count = len(ids) + col_space = 2 + + len_id = len(pprint_thing(id_head)) + space_num = 3 + col_space + + header = _put_str(id_head, space_num) + + lines.append(f"Series name: {self.data.name}") + + if show_counts: + counts = self._get_counts() + if col_count != len(counts): # pragma: no cover + raise AssertionError( + f"Columns must equal counts ({col_count} != {len(counts)})" + ) + count_header = "Non-Null Count" + len_count = len(count_header) + non_null = " non-null" + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + space_count = max(len_count, max_count) + col_space + count_temp = "{count}" + non_null + else: + count_header = "" + space_count = len(count_header) + len_count = space_count + count_temp = "{count}" + + dtype_header = "Dtype" + len_dtype = len(dtype_header) + max_dtypes = max(len(pprint_thing(k)) for k in dtypes) + space_dtype = max(len_dtype, max_dtypes) + header += _put_str(count_header, space_count) + _put_str( + dtype_header, space_dtype + ) + + lines.append(header) + lines.append( + _put_str("-" * len_id, space_num) + + _put_str("-" * len_count, space_count) + + _put_str("-" * len_dtype, space_dtype) + ) + + dtype = self.data.dtype + + line_no = _put_str(f" {0}", space_num) + count = "" + if show_counts: + count = self.data.count() + + lines.append( + line_no + + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtype, space_dtype) + ) + + def _non_verbose_repr(self, lines, ids): + pass From c04dabf94aa0d4917bbca3fa0e1a4bbe60369801 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 7 Jun 2020 21:31:16 +0100 Subject: [PATCH 39/59] wip --- pandas/io/formats/info.py | 35 +++++++--------------------- pandas/tests/io/formats/test_info.py | 14 +++++------ 2 files changed, 16 insertions(+), 33 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index a275223d761e3..b1adc2e67c839 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -396,30 +396,22 @@ def space(self, max_col, len_column, col_space): def _verbose_repr(self, lines, ids, dtypes, show_counts): - id_head = " # " - col_count = len(ids) - col_space = 2 + id_space = 2 - len_id = len(pprint_thing(id_head)) - space_num = 3 + col_space - - header = _put_str(id_head, space_num) + header = "" lines.append(f"Series name: {self.data.name}") if show_counts: - counts = self._get_counts() - if col_count != len(counts): # pragma: no cover - raise AssertionError( - f"Columns must equal counts ({col_count} != {len(counts)})" - ) + count = self.data.count() count_header = "Non-Null Count" len_count = len(count_header) non_null = " non-null" - max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) - space_count = max(len_count, max_count) + col_space + max_count = len(pprint_thing(self.data.count())) + len(non_null) + space_count = max(len_count, max_count) + id_space count_temp = "{count}" + non_null else: + count = "" count_header = "" space_count = len(count_header) len_count = space_count @@ -435,22 +427,13 @@ def _verbose_repr(self, lines, ids, dtypes, show_counts): lines.append(header) lines.append( - _put_str("-" * len_id, space_num) - + _put_str("-" * len_count, space_count) + _put_str("-" * len_count, space_count) + _put_str("-" * len_dtype, space_dtype) ) - dtype = self.data.dtype - - line_no = _put_str(f" {0}", space_num) - count = "" - if show_counts: - count = self.data.count() - lines.append( - line_no - + _put_str(count_temp.format(count=count), space_count) - + _put_str(dtype, space_dtype) + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtypes[0], space_dtype) ) def _non_verbose_repr(self, lines, ids): diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 049c339665b72..23f66389b1436 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -105,9 +105,9 @@ def test_info_series(verbose): """ if verbose: expected += """Series name: sth - # Non-Null Count Dtype ---- -------------- ----- - 0 10 non-null int64 +Non-Null Count Dtype +-------------- ----- +10 non-null int64 """ expected += f"""dtypes: int64(1) memory usage: {s.memory_usage()}.0+ bytes @@ -169,9 +169,9 @@ def test_info_memory(): RangeIndex: 2 entries, 0 to 1 Series name: None - # Non-Null Count Dtype - --- -------------- ----- - 0 2 non-null int64 + Non-Null Count Dtype + -------------- ----- + 2 non-null int64 dtypes: int64(1) memory usage: {bytes} bytes """ @@ -254,7 +254,7 @@ def test_info_shows_column_dtypes(): buf = StringIO() s.info(buf=buf) res = buf.getvalue() - name = f" 0 {n:d} non-null {dtype}" + name = f"{n:d} non-null {dtype}" assert name in res From d9993eeb52859d14a5530ced439d69bf8c0bdc25 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 13 Jun 2020 09:49:07 +0100 Subject: [PATCH 40/59] some typing / removing unnecessary methods --- pandas/io/formats/info.py | 154 +++++++++++++++++--------------------- 1 file changed, 67 insertions(+), 87 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index b1adc2e67c839..0ff7b05854f22 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,11 +1,11 @@ import sys -from typing import IO, TYPE_CHECKING, Optional, Tuple, Union +from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union from pandas._config import get_option from pandas._typing import Dtype, FrameOrSeries -from pandas.core.dtypes.generic import ABCDataFrame +from pandas.core.dtypes.generic import ABCSeries from pandas.core.indexes.api import Index @@ -116,6 +116,18 @@ def __init__( self.memory_usage = memory_usage self.null_counts = null_counts + def _get_mem_usage(self, deep: bool) -> int: + raise NotImplementedError + + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + raise NotImplementedError + + def _verbose_repr(self, lines, ids, dtypes, show_counts): + raise NotImplementedError + + def _non_verbose_repr(self, lines, ids): + raise NotImplementedError + def get_info(self) -> None: lines = [] @@ -182,34 +194,14 @@ def _sizeof_fmt(num, size_qualifier): lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") fmt.buffer_put_lines(self.buf, lines) - def _get_mem_usage(self, deep: bool) -> int: - raise NotImplementedError - - def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: - raise NotImplementedError - - def _get_counts(self) -> "Series": - raise NotImplementedError - - def space(self, max_col, len_column, col_space): - raise NotImplementedError - - def _verbose_repr(self, lines, ids, dtypes, show_counts): - raise NotImplementedError - - def _non_verbose_repr(self, lines, ids): - raise NotImplementedError - class DataFrameInfo(Info): def _get_mem_usage(self, deep: bool) -> int: """ - Get DataFrame or Series' memory usage in bytes. + Get DataFrame's memory usage in bytes. Parameters ---------- - data : DataFrame or Series - Object that `info` was called on. deep : bool If True, introspect the data deeply by interrogating object dtypes for system-level memory consumption, and include it in the returned @@ -222,45 +214,36 @@ def _get_mem_usage(self, deep: bool) -> int: """ return self.data.memory_usage(index=True, deep=deep).sum() - def _get_counts(self) -> "Series": + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: """ - Get DataFrame or Series' non-NA counts. - - Parameters - ---------- - data : DataFrame or Series - Object that `info` was called on. + Get DataFrame's column names and dtypes. Returns ------- - counts : Series - Count non-NA cells (for each column in the DataFrame case). + ids : Index + DataFrame's column names. + dtypes : Series + Dtype of each of the DataFrame's columns. """ - return self.data.count() + return self.data.columns, self.data.dtypes - def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + def _verbose_repr( + self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool + ) -> None: """ - Get DataFrame or Series' columns/name and dtypes. + Display name, non-null count (optionally), and dtype for each column. Parameters ---------- - data : DataFrame or Series - Object that `info` was called on. - - Returns - ------- + lines : List[str] + Lines that will contain `info` representation. ids : Index - DataFrame's columns or Series' name. + The DataFrame's column names. dtypes : Series - Dtype of each of the DataFrame's columns or the Series' dtype. + The DataFrame's columns' dtypes. + show_counts : bool + If True, count of non-NA cells for each column will be appended to `lines`. """ - return self.data.columns, self.data.dtypes - - def space(self, max_col, len_column, col_space): - return max(max_col, len_column) + col_space - - def _verbose_repr(self, lines, ids, dtypes, show_counts): - id_head = " # " column_head = "Column" col_space = 2 @@ -268,6 +251,7 @@ def _verbose_repr(self, lines, ids, dtypes, show_counts): max_col = max(len(pprint_thing(k)) for k in ids) len_column = len(pprint_thing(column_head)) + space = max(max_col, len_column) + col_space max_id = len(pprint_thing(col_count)) len_id = len(pprint_thing(id_head)) @@ -275,15 +259,12 @@ def _verbose_repr(self, lines, ids, dtypes, show_counts): header = _put_str(id_head, space_num) - if isinstance(self.data, ABCDataFrame): - lines.append(f"Data columns (total {col_count} columns):") - len_column = len(pprint_thing(column_head)) - header += _put_str(column_head, self.space(max_col, len_column, col_space)) - else: - lines.append(f"Series name: {self.data.name}") + lines.append(f"Data columns (total {col_count} columns):") + len_column = len(pprint_thing(column_head)) + header = _put_str(id_head, space_num) + _put_str(column_head, space) if show_counts: - counts = self._get_counts() + counts = self.data.count() if col_count != len(counts): # pragma: no cover raise AssertionError( f"Columns must equal counts ({col_count} != {len(counts)})" @@ -311,7 +292,7 @@ def _verbose_repr(self, lines, ids, dtypes, show_counts): lines.append(header) lines.append( _put_str("-" * len_id, space_num) - + _put_str("-" * len_column, self.space(max_col, len_column, col_space)) + + _put_str("-" * len_column, space) + _put_str("-" * len_count, space_count) + _put_str("-" * len_dtype, space_dtype) ) @@ -327,24 +308,32 @@ def _verbose_repr(self, lines, ids, dtypes, show_counts): lines.append( line_no - + _put_str(col, self.space(max_col, len_column, col_space)) + + _put_str(col, space) + _put_str(count_temp.format(count=count), space_count) + _put_str(dtype, space_dtype) ) - def _non_verbose_repr(self, lines, ids): + def _non_verbose_repr(self, lines: List[str], ids: "Series") -> None: + """ + Append short summary of columns' names to `lines`. + + Parameters + ---------- + lines : List[str] + Lines that will contain `info` representation. + ids : Index + The DataFrame's column names. + """ lines.append(ids._summary(name="Columns")) class SeriesInfo(Info): def _get_mem_usage(self, deep: bool) -> int: """ - Get DataFrame or Series' memory usage in bytes. + Get Series' memory usage in bytes. Parameters ---------- - data : DataFrame or Series - Object that `info` was called on. deep : bool If True, introspect the data deeply by interrogating object dtypes for system-level memory consumption, and include it in the returned @@ -357,44 +346,35 @@ def _get_mem_usage(self, deep: bool) -> int: """ return self.data.memory_usage(index=True, deep=deep) - def _get_counts(self) -> "Series": + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: """ - Get DataFrame or Series' non-NA counts. - - Parameters - ---------- - data : DataFrame or Series - Object that `info` was called on. + Get Series' name and dtypes. Returns ------- - counts : Series - Count non-NA cells (for each column in the DataFrame case). + ids : Index + Series' name. + dtypes : Series + Series' dtype. """ - return self.data._constructor(self.data.count()) + assert isinstance(self.data, ABCSeries) # help mypy + return Index([self.data.name]), self.data._constructor(self.data.dtypes) - def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + def _verbose_repr(self, lines, ids, dtypes, show_counts) -> None: """ - Get DataFrame or Series' columns/name and dtypes. + Display name, non-null count (optionally), and dtype. Parameters ---------- - data : DataFrame or Series - Object that `info` was called on. - - Returns - ------- + lines : List[str] + Lines that will contain `info` representation. ids : Index - DataFrame's columns or Series' name. + The Series' name. dtypes : Series - Dtype of each of the DataFrame's columns or the Series' dtype. + The Series' dtype. + show_counts : bool + If True, count of non-NA cells will be appended to `lines`. """ - return Index([self.data.name]), self.data._constructor(self.data.dtypes) - - def space(self, max_col, len_column, col_space): - return 0 - - def _verbose_repr(self, lines, ids, dtypes, show_counts): id_space = 2 From ad39d8593ce00301c66fba011d8e28e67cfa445f Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 30 Jun 2020 19:11:37 +0100 Subject: [PATCH 41/59] resolve better --- pandas/io/formats/info.py | 108 +++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index c0d73369caab6..d1b47268664fe 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -286,68 +286,68 @@ def _verbose_repr( col_count = len(ids) lines.append(f"Data columns (total {col_count} columns):") - lines.append(str(type(self.data))) - lines.append(self.data.index._summary()) + id_head = " # " + column_head = "Column" + col_space = 2 - ids, dtypes = self._get_ids_and_dtypes() - col_count = len(ids) + max_col = max(len(pprint_thing(k)) for k in ids) + len_column = len(pprint_thing(column_head)) + space = max(max_col, len_column) + col_space - if col_count == 0: - lines.append(f"Empty {type(self.data).__name__}") - fmt.buffer_put_lines(self.buf, lines) - return + max_id = len(pprint_thing(col_count)) + len_id = len(pprint_thing(id_head)) + space_num = max(max_id, len_id) + col_space - # hack - max_cols = self.max_cols - if max_cols is None: - max_cols = get_option("display.max_info_columns", col_count + 1) - - max_rows = get_option("display.max_info_rows", len(self.data) + 1) - - if self.null_counts is None: - show_counts = (col_count <= max_cols) and (len(self.data) < max_rows) + header = _put_str(id_head, space_num) + _put_str(column_head, space) + if show_counts: + counts = self.data.count() + if col_count != len(counts): # pragma: no cover + raise AssertionError( + f"Columns must equal counts ({col_count} != {len(counts)})" + ) + count_header = "Non-Null Count" + len_count = len(count_header) + non_null = " non-null" + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + space_count = max(len_count, max_count) + col_space + count_temp = "{count}" + non_null else: - show_counts = self.null_counts - exceeds_info_cols = col_count > max_cols + count_header = "" + space_count = len(count_header) + len_count = space_count + count_temp = "{count}" - def _sizeof_fmt(num, size_qualifier): - # returns size in human readable format - for x in ["bytes", "KB", "MB", "GB", "TB"]: - if num < 1024.0: - return f"{num:3.1f}{size_qualifier} {x}" - num /= 1024.0 - return f"{num:3.1f}{size_qualifier} PB" + dtype_header = "Dtype" + len_dtype = len(dtype_header) + max_dtypes = max(len(pprint_thing(k)) for k in dtypes) + space_dtype = max(len_dtype, max_dtypes) + header += _put_str(count_header, space_count) + _put_str( + dtype_header, space_dtype + ) - if self.verbose: - self._verbose_repr(lines, ids, dtypes, show_counts) - elif self.verbose is False: # specifically set to False, not necessarily None - self._non_verbose_repr(lines, ids) - else: - if exceeds_info_cols: - self._non_verbose_repr(lines, ids) - else: - self._verbose_repr(lines, ids, dtypes, show_counts) + lines.append(header) + lines.append( + _put_str("-" * len_id, space_num) + + _put_str("-" * len_column, space) + + _put_str("-" * len_count, space_count) + + _put_str("-" * len_dtype, space_dtype) + ) - # groupby dtype.name to collect e.g. Categorical columns - counts = dtypes.value_counts().groupby(lambda x: x.name).sum() - collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] - lines.append(f"dtypes: {', '.join(collected_dtypes)}") + for i, col in enumerate(ids): + dtype = dtypes[i] + col = pprint_thing(col) - if self.memory_usage: - # append memory usage of df to display - size_qualifier = "" - if self.memory_usage == "deep": - deep = True - else: - # size_qualifier is just a best effort; not guaranteed to catch - # all cases (e.g., it misses categorical data even with object - # categories) - deep = False - if "object" in counts or self.data.index._is_memory_usage_qualified(): - size_qualifier = "+" - mem_usage = self._get_mem_usage(deep=deep) - lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") - fmt.buffer_put_lines(self.buf, lines) + line_no = _put_str(f" {i}", space_num) + count = "" + if show_counts: + count = counts[i] + + lines.append( + line_no + + _put_str(col, space) + + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtype, space_dtype) + ) def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: lines.append(ids._summary(name="Columns")) From cad139115591f91921a0e7ad33e801b7dd5c3f3f Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 30 Jun 2020 19:13:53 +0100 Subject: [PATCH 42/59] remove docstrings from inherited class --- pandas/io/formats/info.py | 44 --------------------------------------- 1 file changed, 44 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index d1b47268664fe..105afb8942e62 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -6,8 +6,6 @@ from pandas._typing import Dtype, FrameOrSeries -from pandas.core.dtypes.generic import ABCSeries - from pandas.core.indexes.api import Index from pandas.io.formats import format as fmt @@ -355,55 +353,13 @@ def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: class SeriesInfo(BaseInfo): def _get_mem_usage(self, deep: bool) -> int: - """ - Get Series' memory usage in bytes. - - Parameters - ---------- - deep : bool - If True, introspect the data deeply by interrogating object dtypes - for system-level memory consumption, and include it in the returned - values. - - Returns - ------- - mem_usage : int - Object's total memory usage in bytes. - """ return self.data.memory_usage(index=True, deep=deep) def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: - """ - Get Series' name and dtypes. - - Returns - ------- - ids : Index - Series' name. - dtypes : Series - Series' dtype. - """ - assert isinstance(self.data, ABCSeries) # help mypy return Index([self.data.name]), self.data._constructor(self.data.dtypes) def _verbose_repr(self, lines, ids, dtypes, show_counts) -> None: - """ - Display name, non-null count (optionally), and dtype. - - Parameters - ---------- - lines : List[str] - Lines that will contain `info` representation. - ids : Index - The Series' name. - dtypes : Series - The Series' dtype. - show_counts : bool - If True, count of non-NA cells will be appended to `lines`. - """ - id_space = 2 - header = "" lines.append(f"Series name: {self.data.name}") From a53033bb2ac0df31f3b5c759f692731354c83deb Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 30 Jun 2020 19:19:21 +0100 Subject: [PATCH 43/59] fix typing --- pandas/core/series.py | 6 +++--- pandas/io/formats/info.py | 9 ++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 21b30abaaeab9..1b56f9731f2eb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -92,7 +92,7 @@ from pandas.core.tools.datetimes import to_datetime import pandas.io.formats.format as fmt -from pandas.io.formats.info import Info, SeriesInfo +from pandas.io.formats.info import SeriesInfo import pandas.plotting if TYPE_CHECKING: @@ -4630,7 +4630,7 @@ def replace( Series.memory_usage: Memory usage of Series.""" ), ) - @doc(Info) + @doc(SeriesInfo) def info( self, verbose: Optional[bool] = None, @@ -4646,7 +4646,7 @@ def info( ) return SeriesInfo( self, verbose, buf, max_cols, memory_usage, null_counts - ).get_info() + ).info() @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 105afb8942e62..a7c3596d47948 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,6 +1,6 @@ from abc import ABCMeta, abstractmethod import sys -from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union +from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union, cast from pandas._config import get_option @@ -356,7 +356,10 @@ def _get_mem_usage(self, deep: bool) -> int: return self.data.memory_usage(index=True, deep=deep) def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: - return Index([self.data.name]), self.data._constructor(self.data.dtypes) + return ( + Index([self.data.name]), + cast("Series", self.data._constructor(self.data.dtypes)), + ) def _verbose_repr(self, lines, ids, dtypes, show_counts) -> None: id_space = 2 @@ -398,5 +401,5 @@ def _verbose_repr(self, lines, ids, dtypes, show_counts) -> None: + _put_str(dtypes[0], space_dtype) ) - def _non_verbose_repr(self, lines, ids): + def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: pass From f0e2290bb1b9bf1615e71e823bc2d8f5cbe3dbe6 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 30 Jun 2020 19:53:21 +0100 Subject: [PATCH 44/59] :art: --- pandas/io/formats/info.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index a7c3596d47948..5f812fa39ba39 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -356,12 +356,13 @@ def _get_mem_usage(self, deep: bool) -> int: return self.data.memory_usage(index=True, deep=deep) def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: - return ( - Index([self.data.name]), - cast("Series", self.data._constructor(self.data.dtypes)), - ) + ids = Index([self.data.name]) + dtypes = cast("Series", self.data._constructor(self.data.dtypes)) + return ids, dtypes - def _verbose_repr(self, lines, ids, dtypes, show_counts) -> None: + def _verbose_repr( + self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool + ) -> None: id_space = 2 header = "" From 53e8c2095f3f2ac5f56a9c3e2e35d6d79b62ef03 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 1 Jul 2020 08:01:30 +0100 Subject: [PATCH 45/59] :art:, fix doctests --- pandas/core/series.py | 18 +++++++++--------- pandas/io/formats/info.py | 9 ++++----- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1b56f9731f2eb..ad79af8c09da4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4573,9 +4573,9 @@ def replace( Int64Index: 5 entries, 1 to 5 Series name: None - # Non-Null Count Dtype - --- -------------- ----- - 0 5 non-null object + Non-Null Count Dtype + -------------- ----- + 5 non-null object dtypes: object(1) memory usage: 80.0+ bytes @@ -4608,9 +4608,9 @@ def replace( RangeIndex: 1000000 entries, 0 to 999999 Series name: None - # Non-Null Count Dtype - --- -------------- ----- - 0 1000000 non-null object + Non-Null Count Dtype + -------------- ----- + 1000000 non-null object dtypes: object(1) memory usage: 7.6+ MB @@ -4618,9 +4618,9 @@ def replace( RangeIndex: 1000000 entries, 0 to 999999 Series name: None - # Non-Null Count Dtype - --- -------------- ----- - 0 1000000 non-null object + Non-Null Count Dtype + -------------- ----- + 1000000 non-null object dtypes: object(1) memory usage: 62.9 MB""" ), diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 5f812fa39ba39..e2821f68d1fc4 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -363,17 +363,16 @@ def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: def _verbose_repr( self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool ) -> None: - id_space = 2 - header = "" - lines.append(f"Series name: {self.data.name}") + id_space = 2 + if show_counts: count = self.data.count() count_header = "Non-Null Count" len_count = len(count_header) non_null = " non-null" - max_count = len(pprint_thing(self.data.count())) + len(non_null) + max_count = len(pprint_thing(count)) + len(non_null) space_count = max(len_count, max_count) + id_space count_temp = "{count}" + non_null else: @@ -387,7 +386,7 @@ def _verbose_repr( len_dtype = len(dtype_header) max_dtypes = max(len(pprint_thing(k)) for k in dtypes) space_dtype = max(len_dtype, max_dtypes) - header += _put_str(count_header, space_count) + _put_str( + header = _put_str(count_header, space_count) + _put_str( dtype_header, space_dtype ) From ee717c8aa637af546c0b2a05abbf86ab5cc14f5b Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 1 Jul 2020 21:08:21 +0100 Subject: [PATCH 46/59] factor out _get_count_configs --- pandas/io/formats/info.py | 43 +++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index e2821f68d1fc4..730d1931a1de5 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -72,6 +72,28 @@ def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: return f"{num:3.1f}{size_qualifier} PB" +def _get_count_configs( + counts: "Series", col_count: int, col_space: int, show_counts: bool +) -> Tuple[str, int, int, str]: + if show_counts: + if col_count != len(counts): # pragma: no cover + raise AssertionError( + f"Columns must equal counts ({col_count} != {len(counts)})" + ) + count_header = "Non-Null Count" + len_count = len(count_header) + non_null = " non-null" + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + space_count = max(len_count, max_count) + col_space + count_temp = "{count}" + non_null + else: + count_header = "" + space_count = len(count_header) + len_count = space_count + count_temp = "{count}" + return count_header, space_count, len_count, count_temp + + class BaseInfo(metaclass=ABCMeta): def __init__( self, @@ -297,23 +319,10 @@ def _verbose_repr( space_num = max(max_id, len_id) + col_space header = _put_str(id_head, space_num) + _put_str(column_head, space) - if show_counts: - counts = self.data.count() - if col_count != len(counts): # pragma: no cover - raise AssertionError( - f"Columns must equal counts ({col_count} != {len(counts)})" - ) - count_header = "Non-Null Count" - len_count = len(count_header) - non_null = " non-null" - max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) - space_count = max(len_count, max_count) + col_space - count_temp = "{count}" + non_null - else: - count_header = "" - space_count = len(count_header) - len_count = space_count - count_temp = "{count}" + counts = self.data.count() + count_header, space_count, len_count, count_temp = _get_count_configs( + counts, col_count, col_space, show_counts + ) dtype_header = "Dtype" len_dtype = len(dtype_header) From 4d7a211dcdf5522e8ed22aefbfdb616f5208fa4a Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 1 Jul 2020 21:12:48 +0100 Subject: [PATCH 47/59] factor _get_count_configs out of Series._verbose_repr as well --- pandas/io/formats/info.py | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 730d1931a1de5..127dfddfd6903 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -73,10 +73,10 @@ def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: def _get_count_configs( - counts: "Series", col_count: int, col_space: int, show_counts: bool + counts: "Series", col_space: int, show_counts: bool, col_count: Optional[int] = None ) -> Tuple[str, int, int, str]: if show_counts: - if col_count != len(counts): # pragma: no cover + if col_count is not None and col_count != len(counts): # pragma: no cover raise AssertionError( f"Columns must equal counts ({col_count} != {len(counts)})" ) @@ -321,7 +321,7 @@ def _verbose_repr( header = _put_str(id_head, space_num) + _put_str(column_head, space) counts = self.data.count() count_header, space_count, len_count, count_temp = _get_count_configs( - counts, col_count, col_space, show_counts + counts, col_space, show_counts, col_count ) dtype_header = "Dtype" @@ -376,20 +376,10 @@ def _verbose_repr( id_space = 2 - if show_counts: - count = self.data.count() - count_header = "Non-Null Count" - len_count = len(count_header) - non_null = " non-null" - max_count = len(pprint_thing(count)) + len(non_null) - space_count = max(len_count, max_count) + id_space - count_temp = "{count}" + non_null - else: - count = "" - count_header = "" - space_count = len(count_header) - len_count = space_count - count_temp = "{count}" + counts = self.data._constructor(self.data.count()) + count_header, space_count, len_count, count_temp = _get_count_configs( + counts, id_space, show_counts + ) dtype_header = "Dtype" len_dtype = len(dtype_header) @@ -404,11 +394,12 @@ def _verbose_repr( _put_str("-" * len_count, space_count) + _put_str("-" * len_dtype, space_dtype) ) - - lines.append( - _put_str(count_temp.format(count=count), space_count) - + _put_str(dtypes[0], space_dtype) - ) + for count in counts: + # TODO factor this out too + lines.append( + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtypes[0], space_dtype) + ) def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: pass From 6eccf006f9ffe84a045503c8ad7c9b80af03e9e2 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 2 Jul 2020 17:58:05 +0100 Subject: [PATCH 48/59] fix typing --- pandas/io/formats/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 127dfddfd6903..c9916ea520a7f 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -376,7 +376,7 @@ def _verbose_repr( id_space = 2 - counts = self.data._constructor(self.data.count()) + counts = cast("Series", self.data._constructor(self.data.count())) count_header, space_count, len_count, count_temp = _get_count_configs( counts, id_space, show_counts ) From 81d22ebfd09b0f8b4ec47dbb6cd41b188f32f62a Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 2 Jul 2020 18:14:05 +0100 Subject: [PATCH 49/59] factor out _display_counts_and_dtypes --- pandas/io/formats/info.py | 74 ++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 21 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index c9916ea520a7f..182aa61b3d127 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -75,6 +75,7 @@ def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: def _get_count_configs( counts: "Series", col_space: int, show_counts: bool, col_count: Optional[int] = None ) -> Tuple[str, int, int, str]: + # TODO: 1 add docstring, 2 check if we really need pragma: no cover if show_counts: if col_count is not None and col_count != len(counts): # pragma: no cover raise AssertionError( @@ -94,6 +95,35 @@ def _get_count_configs( return count_header, space_count, len_count, count_temp +def _display_counts_and_dtypes( + lines: List[str], + ids: "Series", + dtypes: "Series", + show_counts: bool, + counts: "Series", + count_temp: str, + space_count: int, + space_dtype: int, + space: int = 0, + space_num: int = 0, +) -> None: + for i, col in enumerate(ids): + dtype = dtypes[i] + col = pprint_thing(col) + + line_no = _put_str(f" {i}", space_num) + count = "" + if show_counts: + count = counts[i] + + lines.append( + line_no + + _put_str(col, space) + + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtype, space_dtype) + ) + + class BaseInfo(metaclass=ABCMeta): def __init__( self, @@ -340,21 +370,18 @@ def _verbose_repr( + _put_str("-" * len_dtype, space_dtype) ) - for i, col in enumerate(ids): - dtype = dtypes[i] - col = pprint_thing(col) - - line_no = _put_str(f" {i}", space_num) - count = "" - if show_counts: - count = counts[i] - - lines.append( - line_no - + _put_str(col, space) - + _put_str(count_temp.format(count=count), space_count) - + _put_str(dtype, space_dtype) - ) + _display_counts_and_dtypes( + lines, + ids, + dtypes, + show_counts, + counts, + count_temp, + space_count, + space_dtype, + space, + space_num, + ) def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: lines.append(ids._summary(name="Columns")) @@ -394,12 +421,17 @@ def _verbose_repr( _put_str("-" * len_count, space_count) + _put_str("-" * len_dtype, space_dtype) ) - for count in counts: - # TODO factor this out too - lines.append( - _put_str(count_temp.format(count=count), space_count) - + _put_str(dtypes[0], space_dtype) - ) + + _display_counts_and_dtypes( + lines, + ids, + dtypes, + show_counts, + counts, + count_temp, + space_count, + space_dtype, + ) def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: pass From f2ca52050f8921b5cd96e3195577380cd0976215 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 2 Jul 2020 18:38:36 +0100 Subject: [PATCH 50/59] fix typing, factor out _get_header_and_spaces --- pandas/io/formats/info.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 182aa61b3d127..31f9d925926ff 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -97,7 +97,7 @@ def _get_count_configs( def _display_counts_and_dtypes( lines: List[str], - ids: "Series", + ids: "Index", dtypes: "Series", show_counts: bool, counts: "Series", @@ -124,6 +124,17 @@ def _display_counts_and_dtypes( ) +def _get_header_and_spaces( + dtypes: "Series", space_count: int, count_header: str, header: str = "" +) -> Tuple[int, str, int]: + dtype_header = "Dtype" + len_dtype = len(dtype_header) + max_dtypes = max(len(pprint_thing(k)) for k in dtypes) + space_dtype = max(len_dtype, max_dtypes) + header += _put_str(count_header, space_count) + _put_str(dtype_header, space_dtype) + return space_dtype, header, len_dtype + + class BaseInfo(metaclass=ABCMeta): def __init__( self, @@ -354,12 +365,8 @@ def _verbose_repr( counts, col_space, show_counts, col_count ) - dtype_header = "Dtype" - len_dtype = len(dtype_header) - max_dtypes = max(len(pprint_thing(k)) for k in dtypes) - space_dtype = max(len_dtype, max_dtypes) - header += _put_str(count_header, space_count) + _put_str( - dtype_header, space_dtype + space_dtype, header, len_dtype = _get_header_and_spaces( + dtypes, space_count, count_header, header ) lines.append(header) @@ -408,12 +415,8 @@ def _verbose_repr( counts, id_space, show_counts ) - dtype_header = "Dtype" - len_dtype = len(dtype_header) - max_dtypes = max(len(pprint_thing(k)) for k in dtypes) - space_dtype = max(len_dtype, max_dtypes) - header = _put_str(count_header, space_count) + _put_str( - dtype_header, space_dtype + space_dtype, header, len_dtype = _get_header_and_spaces( + dtypes, space_count, count_header ) lines.append(header) From 669ff38aaf0f9aa32cca4f88e683c952fece5ae2 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 4 Jul 2020 10:14:28 +0100 Subject: [PATCH 51/59] document _get_count_configs --- pandas/io/formats/info.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 31f9d925926ff..f45c491afd1f7 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -75,7 +75,32 @@ def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: def _get_count_configs( counts: "Series", col_space: int, show_counts: bool, col_count: Optional[int] = None ) -> Tuple[str, int, int, str]: - # TODO: 1 add docstring, 2 check if we really need pragma: no cover + """ + Get configs for displaying counts, depending on the value of `show_counts`. + + Parameters + ---------- + counts : Series + Non-null count of Series (or of each column of DataFrame). + col_space : int + How many space to leave between non-null count and dtype columns. + show_counts : bool + Whether to display non-null counts. + col_count : int, optional + Number of columns in DataFrame. + + Returns + ------- + count_header : str + Header that will be printed out above non-null counts in output. + space_count : int + Number of spaces that count_header should occupy + (including space before `dtypes` column). + len_count : int + Length of count header. + count_temp : str + String that can be formatted to include non-null count. + """ if show_counts: if col_count is not None and col_count != len(counts): # pragma: no cover raise AssertionError( @@ -107,6 +132,9 @@ def _display_counts_and_dtypes( space: int = 0, space_num: int = 0, ) -> None: + """ + Display count and dtype of Series (or of each column of Frame). + """ for i, col in enumerate(ids): dtype = dtypes[i] col = pprint_thing(col) @@ -127,6 +155,9 @@ def _display_counts_and_dtypes( def _get_header_and_spaces( dtypes: "Series", space_count: int, count_header: str, header: str = "" ) -> Tuple[int, str, int]: + """ + Append extra columns (count and type) to header, if applicable. + """ dtype_header = "Dtype" len_dtype = len(dtype_header) max_dtypes = max(len(pprint_thing(k)) for k in dtypes) From 6f8f8b11067ae9e02454290964d89ce353786419 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 4 Jul 2020 10:38:09 +0100 Subject: [PATCH 52/59] document _display_counts_and_dtypes and _get_header_and_spaces --- pandas/io/formats/info.py | 51 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index f45c491afd1f7..616b0ad85deaf 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -133,7 +133,33 @@ def _display_counts_and_dtypes( space_num: int = 0, ) -> None: """ - Display count and dtype of Series (or of each column of Frame). + Append count and dtype of Series (or of each column of Frame) to `lines`. + + Parameters + ---------- + lines : List[str] + At this stage, this contains the main header and the info table headers. + ids : Index + Series name (or names of DataFrame columns). + dtypes : Series + Series dtype (or dtypes of DataFrame columns). + show_counts : bool + Whether to show non-null counts. + counts : Series + Non-null counts of Series (or of each of DataFrame's columns). + count_temp : str + String that can be formatted to include non-null count. + space_count : int + Number of spaces that count_header should occupy + (including space before `dtypes` column). + space_dtype : int + Number of spaces that `dtypes` column should occupy. + space : int = 0 + Number of spaces that `Column` header should occupy + (including space before `non-null count` column). + space_num : int = 0 + Number of spaces that ` # ` header should occupy (including space + before `Column` column), only applicable for `DataFrame.info`. """ for i, col in enumerate(ids): dtype = dtypes[i] @@ -157,12 +183,35 @@ def _get_header_and_spaces( ) -> Tuple[int, str, int]: """ Append extra columns (count and type) to header, if applicable. + + Parameters + ---------- + dtypes : Series + Series dtype (or dtypes of DataFrame columns). + space_count : int + Number of spaces that count_header should occupy + (including space before `dtypes` column). + count_header : str + Header that will be printed out above non-null counts in output. + header : str + Current header. + + Returns + ------- + space_dtype : int + Number of spaces that `dtypes` column should occupy. + header : str + Header with extra columns (count and type) appended. + len_dtype : int + Length of dtype header. """ + breakpoint() dtype_header = "Dtype" len_dtype = len(dtype_header) max_dtypes = max(len(pprint_thing(k)) for k in dtypes) space_dtype = max(len_dtype, max_dtypes) header += _put_str(count_header, space_count) + _put_str(dtype_header, space_dtype) + breakpoint() return space_dtype, header, len_dtype From 97dc73cf85c5e8ac05cc707896fc6f4f5946d4db Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 4 Jul 2020 10:40:57 +0100 Subject: [PATCH 53/59] remove breakpoints --- pandas/io/formats/info.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 616b0ad85deaf..e81ec54960f68 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -205,13 +205,11 @@ def _get_header_and_spaces( len_dtype : int Length of dtype header. """ - breakpoint() dtype_header = "Dtype" len_dtype = len(dtype_header) max_dtypes = max(len(pprint_thing(k)) for k in dtypes) space_dtype = max(len_dtype, max_dtypes) header += _put_str(count_header, space_count) + _put_str(dtype_header, space_dtype) - breakpoint() return space_dtype, header, len_dtype From 0707f32ea18058f029d8a222f9b62e5e3bd7d207 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 4 Jul 2020 10:48:02 +0100 Subject: [PATCH 54/59] fix docstring substitution --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index ad79af8c09da4..32d715fecba51 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4630,7 +4630,7 @@ def replace( Series.memory_usage: Memory usage of Series.""" ), ) - @doc(SeriesInfo) + @doc(SeriesInfo.info) def info( self, verbose: Optional[bool] = None, From ddf9efc0ab25b011538cb069f18b14a2f74e873b Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 19 Sep 2020 16:47:04 +0100 Subject: [PATCH 55/59] fix failing doctest --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 55be5629f07d2..6f2ea68a78506 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4614,7 +4614,7 @@ def replace( -------------- ----- 1000000 non-null object dtypes: object(1) - memory usage: 62.9 MB""" + memory usage: 55.3 MB""" ), see_also_sub=( """ From 21d94b235096f0c8c8726dd550a3ad48e09b2c65 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 19 Sep 2020 17:20:43 +0100 Subject: [PATCH 56/59] use CountConfigs namedtuple --- pandas/core/series.py | 2 +- pandas/io/formats/info.py | 109 ++++++++++++++++++++++---------------- 2 files changed, 64 insertions(+), 47 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 6f2ea68a78506..55be5629f07d2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4614,7 +4614,7 @@ def replace( -------------- ----- 1000000 non-null object dtypes: object(1) - memory usage: 55.3 MB""" + memory usage: 62.9 MB""" ), see_also_sub=( """ diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index e81ec54960f68..0b92495603ebb 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,6 +1,6 @@ from abc import ABCMeta, abstractmethod import sys -from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union, cast +from typing import IO, TYPE_CHECKING, List, NamedTuple, Optional, Tuple, Union, cast from pandas._config import get_option @@ -15,6 +15,49 @@ from pandas.core.series import Series # noqa: F401 +class CountConfigs(NamedTuple): + """ + Configs with which to display counts. + + Attributes + ---------- + counts : Series + Non-null count of Series (or of each column of DataFrame). + count_header : str + Header that will be printed out above non-null counts in output. + space_count : int + Number of spaces that count_header should occupy + (including space before `dtypes` column). + len_count : int + Length of count header. + count_temp : str + String that can be formatted to include non-null count. + """ + + counts: "Series" + count_header: str + space_count: int + len_count: int + count_temp: str + + +class HeaderAndSpaceConfigs(NamedTuple): + """ + Attributes + ---------- + space_dtype : int + Number of spaces that `dtypes` column should occupy. + header : str + Header with extra columns (count and type) appended. + len_dtype : int + Length of dtype header. + """ + + space_dtype: int + header: str + len_dtype: int + + def _put_str(s: Union[str, Dtype], space: int) -> str: """ Make string of specified length, padding to the right if necessary. @@ -74,7 +117,7 @@ def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: def _get_count_configs( counts: "Series", col_space: int, show_counts: bool, col_count: Optional[int] = None -) -> Tuple[str, int, int, str]: +) -> CountConfigs: """ Get configs for displaying counts, depending on the value of `show_counts`. @@ -91,15 +134,7 @@ def _get_count_configs( Returns ------- - count_header : str - Header that will be printed out above non-null counts in output. - space_count : int - Number of spaces that count_header should occupy - (including space before `dtypes` column). - len_count : int - Length of count header. - count_temp : str - String that can be formatted to include non-null count. + CountConfigs """ if show_counts: if col_count is not None and col_count != len(counts): # pragma: no cover @@ -117,7 +152,7 @@ def _get_count_configs( space_count = len(count_header) len_count = space_count count_temp = "{count}" - return count_header, space_count, len_count, count_temp + return CountConfigs(counts, count_header, space_count, len_count, count_temp) def _display_counts_and_dtypes( @@ -125,9 +160,7 @@ def _display_counts_and_dtypes( ids: "Index", dtypes: "Series", show_counts: bool, - counts: "Series", - count_temp: str, - space_count: int, + count_configs: CountConfigs, space_dtype: int, space: int = 0, space_num: int = 0, @@ -145,13 +178,8 @@ def _display_counts_and_dtypes( Series dtype (or dtypes of DataFrame columns). show_counts : bool Whether to show non-null counts. - counts : Series - Non-null counts of Series (or of each of DataFrame's columns). - count_temp : str - String that can be formatted to include non-null count. - space_count : int - Number of spaces that count_header should occupy - (including space before `dtypes` column). + count_configs: CountConfigs + Configs with which to display counts. space_dtype : int Number of spaces that `dtypes` column should occupy. space : int = 0 @@ -168,12 +196,14 @@ def _display_counts_and_dtypes( line_no = _put_str(f" {i}", space_num) count = "" if show_counts: - count = counts[i] + count = count_configs.counts[i] lines.append( line_no + _put_str(col, space) - + _put_str(count_temp.format(count=count), space_count) + + _put_str( + count_configs.count_temp.format(count=count), count_configs.space_count + ) + _put_str(dtype, space_dtype) ) @@ -439,20 +469,18 @@ def _verbose_repr( header = _put_str(id_head, space_num) + _put_str(column_head, space) counts = self.data.count() - count_header, space_count, len_count, count_temp = _get_count_configs( - counts, col_space, show_counts, col_count - ) + count_configs = _get_count_configs(counts, col_space, show_counts, col_count) space_dtype, header, len_dtype = _get_header_and_spaces( - dtypes, space_count, count_header, header + dtypes, count_configs.space_count, count_configs.count_header, header ) lines.append(header) lines.append( _put_str("-" * len_id, space_num) + _put_str("-" * len_column, space) - + _put_str("-" * len_count, space_count) - + _put_str("-" * len_dtype, space_dtype) + + _put_str("-" * count_configs.len_count, count_configs.space_count) + + _put_str("-" * len_dtype, space_dtype,) ) _display_counts_and_dtypes( @@ -460,9 +488,7 @@ def _verbose_repr( ids, dtypes, show_counts, - counts, - count_temp, - space_count, + count_configs, space_dtype, space, space_num, @@ -489,29 +515,20 @@ def _verbose_repr( id_space = 2 counts = cast("Series", self.data._constructor(self.data.count())) - count_header, space_count, len_count, count_temp = _get_count_configs( - counts, id_space, show_counts - ) + count_configs = _get_count_configs(counts, id_space, show_counts) space_dtype, header, len_dtype = _get_header_and_spaces( - dtypes, space_count, count_header + dtypes, count_configs.space_count, count_configs.count_header ) lines.append(header) lines.append( - _put_str("-" * len_count, space_count) + _put_str("-" * count_configs.len_count, count_configs.space_count) + _put_str("-" * len_dtype, space_dtype) ) _display_counts_and_dtypes( - lines, - ids, - dtypes, - show_counts, - counts, - count_temp, - space_count, - space_dtype, + lines, ids, dtypes, show_counts, count_configs, space_dtype, ) def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: From a213d9c11a676e5ea90789fc6244c84d93a39cf0 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 19 Sep 2020 17:24:04 +0100 Subject: [PATCH 57/59] :fire: --- pandas/io/formats/info.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 0b92495603ebb..256c546284875 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -41,23 +41,6 @@ class CountConfigs(NamedTuple): count_temp: str -class HeaderAndSpaceConfigs(NamedTuple): - """ - Attributes - ---------- - space_dtype : int - Number of spaces that `dtypes` column should occupy. - header : str - Header with extra columns (count and type) appended. - len_dtype : int - Length of dtype header. - """ - - space_dtype: int - header: str - len_dtype: int - - def _put_str(s: Union[str, Dtype], space: int) -> str: """ Make string of specified length, padding to the right if necessary. From 089ce2452abf7553ae526cc3e9ba2a774af841ce Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 19 Sep 2020 17:25:35 +0100 Subject: [PATCH 58/59] remove trailing comma --- pandas/io/formats/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 256c546284875..cbb5ae9750c0f 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -463,7 +463,7 @@ def _verbose_repr( _put_str("-" * len_id, space_num) + _put_str("-" * len_column, space) + _put_str("-" * count_configs.len_count, count_configs.space_count) - + _put_str("-" * len_dtype, space_dtype,) + + _put_str("-" * len_dtype, space_dtype) ) _display_counts_and_dtypes( From 45813853c70f92ee887fecac9e1fcf68128108ee Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 19 Sep 2020 17:26:29 +0100 Subject: [PATCH 59/59] fix failing doctest --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 55be5629f07d2..6f2ea68a78506 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4614,7 +4614,7 @@ def replace( -------------- ----- 1000000 non-null object dtypes: object(1) - memory usage: 62.9 MB""" + memory usage: 55.3 MB""" ), see_also_sub=( """