diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d12ebeafe8510..5134ddcf1cc67 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -140,7 +140,7 @@ from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt -from pandas.io.formats.info import info +from pandas.io.formats.info import DataFrameInfo import pandas.plotting if TYPE_CHECKING: @@ -2459,11 +2459,11 @@ def to_html( RangeIndex: 5 entries, 0 to 4 Data columns (total 3 columns): - # Column Non-Null Count Dtype + # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 int_col 5 non-null int64 - 1 text_col 5 non-null object - 2 float_col 5 non-null float64 + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 dtypes: float64(1), int64(1), object(1) memory usage: 248.0+ bytes @@ -2502,11 +2502,11 @@ def to_html( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - # Column Non-Null Count Dtype + # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 22.9+ MB @@ -2514,11 +2514,11 @@ def to_html( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - # Column Non-Null Count Dtype + # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 188.8 MB""" ), @@ -2529,7 +2529,7 @@ def to_html( DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), ) - @doc(info) + @doc(DataFrameInfo.info) def info( self, verbose: Optional[bool] = None, @@ -2538,7 +2538,9 @@ def info( memory_usage: Optional[Union[bool, str]] = None, null_counts: Optional[bool] = None, ) -> None: - return info(self, verbose, buf, max_cols, memory_usage, null_counts) + return DataFrameInfo( + self, verbose, buf, max_cols, memory_usage, null_counts + ).info() def memory_usage(self, index=True, deep=False) -> Series: """ diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index b1dcafa7a7a8f..7a53b46a4ac0f 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,15 +1,17 @@ +from abc import ABCMeta, abstractmethod import sys -from typing import IO, TYPE_CHECKING, Optional, Tuple, Union +from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union from pandas._config import get_option from pandas._typing import Dtype, FrameOrSeries +from pandas.core.indexes.api import Index + from pandas.io.formats import format as fmt from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: - from pandas.core.indexes.api import Index # noqa: F401 from pandas.core.series import Series # noqa: F401 @@ -39,115 +41,247 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) -def _get_ids_and_dtypes(data: FrameOrSeries) -> Tuple["Index", "Series"]: +def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: """ - Get DataFrame's columns and dtypes. + Return size in human readable format. Parameters ---------- - data : DataFrame - Object that `info` was called on. + num : int + Size in bytes. + size_qualifier : str + Either empty, or '+' (if lower bound). Returns ------- - ids : Index - DataFrame's columns. - dtypes : Series - Dtype of each of the DataFrame's columns. - """ - ids = data.columns - dtypes = data.dtypes - return ids, dtypes - - -def info( - data: FrameOrSeries, - verbose: Optional[bool] = None, - buf: Optional[IO[str]] = None, - max_cols: Optional[int] = None, - memory_usage: Optional[Union[bool, str]] = None, - null_counts: Optional[bool] = None, -) -> None: - """ - Print a concise summary of a %(klass)s. - - This method prints information about a %(klass)s including - the index dtype%(type_sub)s, non-null values and memory usage. - - Parameters - ---------- - data : %(klass)s - %(klass)s to print information about. - verbose : bool, optional - Whether to print the full summary. By default, the setting in - ``pandas.options.display.max_info_columns`` is followed. - buf : writable buffer, defaults to sys.stdout - Where to send the output. By default, the output is printed to - sys.stdout. Pass a writable buffer if you need to further process - the output. - %(max_cols_sub)s - memory_usage : bool, str, optional - Specifies whether total memory usage of the %(klass)s - elements (including the index) should be displayed. By default, - this follows the ``pandas.options.display.memory_usage`` setting. - - True always show memory usage. False never shows memory usage. - A value of 'deep' is equivalent to "True with deep introspection". - Memory usage is shown in human-readable units (base-2 - representation). Without deep introspection a memory estimation is - made based in column dtype and number of rows assuming values - consume the same memory amount for corresponding dtypes. With deep - memory introspection, a real memory usage calculation is performed - at the cost of computational resources. - null_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the %(klass)s is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. - - Returns - ------- - None - This method prints a summary of a %(klass)s and returns None. - - See Also - -------- - %(see_also_sub)s + str + Size in human readable format. Examples -------- - %(examples_sub)s - """ - if buf is None: # pragma: no cover - buf = sys.stdout - - lines = [] - - lines.append(str(type(data))) - lines.append(data.index._summary()) - - ids, dtypes = _get_ids_and_dtypes(data) - col_count = len(ids) - - if col_count == 0: - lines.append(f"Empty {type(data).__name__}") - fmt.buffer_put_lines(buf, lines) - return - - # hack - if max_cols is None: - max_cols = get_option("display.max_info_columns", col_count + 1) - - max_rows = get_option("display.max_info_rows", len(data) + 1) + >>> _sizeof_fmt(23028, '') + '22.5 KB' - if null_counts is None: - show_counts = (col_count <= max_cols) and (len(data) < max_rows) - else: - show_counts = null_counts - exceeds_info_cols = col_count > max_cols + >>> _sizeof_fmt(23028, '+') + '22.5+ KB' + """ + for x in ["bytes", "KB", "MB", "GB", "TB"]: + if num < 1024.0: + return f"{num:3.1f}{size_qualifier} {x}" + num /= 1024.0 + return f"{num:3.1f}{size_qualifier} PB" + + +class BaseInfo(metaclass=ABCMeta): + def __init__( + self, + data: FrameOrSeries, + verbose: Optional[bool] = None, + buf: Optional[IO[str]] = None, + max_cols: Optional[int] = None, + memory_usage: Optional[Union[bool, str]] = None, + null_counts: Optional[bool] = None, + ): + if buf is None: # pragma: no cover + buf = sys.stdout + if memory_usage is None: + memory_usage = get_option("display.memory_usage") + + self.data = data + self.verbose = verbose + self.buf = buf + self.max_cols = max_cols + self.memory_usage = memory_usage + self.null_counts = null_counts + + @abstractmethod + def _get_mem_usage(self, deep: bool) -> int: + """ + Get memory usage in bytes. + + Parameters + ---------- + deep : bool + If True, introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. + + Returns + ------- + mem_usage : int + Object's total memory usage in bytes. + """ + pass + + @abstractmethod + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + """ + Get column names and dtypes. + + Returns + ------- + ids : Index + DataFrame's column names. + dtypes : Series + Dtype of each of the DataFrame's columns. + """ + pass + + @abstractmethod + def _verbose_repr( + self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool + ) -> None: + """ + Append name, non-null count (optional), and dtype for each column to `lines`. + + Parameters + ---------- + lines : List[str] + Lines that will contain `info` representation. + ids : Index + The DataFrame's column names. + dtypes : Series + The DataFrame's columns' dtypes. + show_counts : bool + If True, count of non-NA cells for each column will be appended to `lines`. + """ + pass + + @abstractmethod + def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: + """ + Append short summary of columns' names to `lines`. + + Parameters + ---------- + lines : List[str] + Lines that will contain `info` representation. + ids : Index + The DataFrame's column names. + """ + pass + + def info(self) -> None: + """ + Print a concise summary of a %(klass)s. + + This method prints information about a %(klass)s including + the index dtype%(type_sub)s, non-null values and memory usage. + + Parameters + ---------- + data : %(klass)s + %(klass)s to print information about. + verbose : bool, optional + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. + buf : writable buffer, defaults to sys.stdout + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + %(max_cols_sub)s + memory_usage : bool, str, optional + Specifies whether total memory usage of the %(klass)s + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. + + True always show memory usage. False never shows memory usage. + A value of 'deep' is equivalent to "True with deep introspection". + Memory usage is shown in human-readable units (base-2 + representation). Without deep introspection a memory estimation is + made based in column dtype and number of rows assuming values + consume the same memory amount for corresponding dtypes. With deep + memory introspection, a real memory usage calculation is performed + at the cost of computational resources. + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the %(klass)s is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. + + Returns + ------- + None + This method prints a summary of a %(klass)s and returns None. + + See Also + -------- + %(see_also_sub)s + + Examples + -------- + %(examples_sub)s + """ + lines = [] + + lines.append(str(type(self.data))) + lines.append(self.data.index._summary()) + + ids, dtypes = self._get_ids_and_dtypes() + col_count = len(ids) + + if col_count == 0: + lines.append(f"Empty {type(self.data).__name__}") + fmt.buffer_put_lines(self.buf, lines) + return + + # hack + max_cols = self.max_cols + if max_cols is None: + max_cols = get_option("display.max_info_columns", col_count + 1) + + max_rows = get_option("display.max_info_rows", len(self.data) + 1) + + if self.null_counts is None: + show_counts = (col_count <= max_cols) and (len(self.data) < max_rows) + else: + show_counts = self.null_counts + exceeds_info_cols = col_count > max_cols - def _verbose_repr(): + if self.verbose: + self._verbose_repr(lines, ids, dtypes, show_counts) + elif self.verbose is False: # specifically set to False, not necessarily None + self._non_verbose_repr(lines, ids) + else: + if exceeds_info_cols: + self._non_verbose_repr(lines, ids) + else: + self._verbose_repr(lines, ids, dtypes, show_counts) + + # groupby dtype.name to collect e.g. Categorical columns + counts = dtypes.value_counts().groupby(lambda x: x.name).sum() + collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] + lines.append(f"dtypes: {', '.join(collected_dtypes)}") + + if self.memory_usage: + # append memory usage of df to display + size_qualifier = "" + if self.memory_usage == "deep": + deep = True + else: + # size_qualifier is just a best effort; not guaranteed to catch + # all cases (e.g., it misses categorical data even with object + # categories) + deep = False + if "object" in counts or self.data.index._is_memory_usage_qualified(): + size_qualifier = "+" + mem_usage = self._get_mem_usage(deep=deep) + lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") + fmt.buffer_put_lines(self.buf, lines) + + +class DataFrameInfo(BaseInfo): + def _get_mem_usage(self, deep: bool) -> int: + return self.data.memory_usage(index=True, deep=deep).sum() + + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + return self.data.columns, self.data.dtypes + + def _verbose_repr( + self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool + ) -> None: + col_count = len(ids) lines.append(f"Data columns (total {col_count} columns):") id_head = " # " @@ -164,7 +298,7 @@ def _verbose_repr(): header = _put_str(id_head, space_num) + _put_str(column_head, space) if show_counts: - counts = data.count() + counts = self.data.count() if col_count != len(counts): # pragma: no cover raise AssertionError( f"Columns must equal counts ({col_count} != {len(counts)})" @@ -213,46 +347,5 @@ def _verbose_repr(): + _put_str(dtype, space_dtype) ) - def _non_verbose_repr(): + def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: lines.append(ids._summary(name="Columns")) - - def _sizeof_fmt(num, size_qualifier): - # returns size in human readable format - for x in ["bytes", "KB", "MB", "GB", "TB"]: - if num < 1024.0: - return f"{num:3.1f}{size_qualifier} {x}" - num /= 1024.0 - return f"{num:3.1f}{size_qualifier} PB" - - if verbose: - _verbose_repr() - elif verbose is False: # specifically set to False, not necessarily None - _non_verbose_repr() - else: - if exceeds_info_cols: - _non_verbose_repr() - else: - _verbose_repr() - - # groupby dtype.name to collect e.g. Categorical columns - counts = dtypes.value_counts().groupby(lambda x: x.name).sum() - collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] - lines.append(f"dtypes: {', '.join(collected_dtypes)}") - - if memory_usage is None: - memory_usage = get_option("display.memory_usage") - if memory_usage: - # append memory usage of df to display - size_qualifier = "" - if memory_usage == "deep": - deep = True - else: - # size_qualifier is just a best effort; not guaranteed to catch - # all cases (e.g., it misses categorical data even with object - # categories) - deep = False - if "object" in counts or data.index._is_memory_usage_qualified(): - size_qualifier = "+" - mem_usage = data.memory_usage(index=True, deep=deep).sum() - lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") - fmt.buffer_put_lines(buf, lines)