From 164ffd14ebb8173739c98d5b08051459067fc6f4 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 13 Jun 2020 11:17:04 +0100 Subject: [PATCH 01/15] make Info and DataFrameInfo subclasses --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/frame.py | 32 ++-- pandas/io/formats/info.py | 275 ++++++++++++++++++++------------- 3 files changed, 189 insertions(+), 119 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 92f7c0f6b59a3..872c2598cd9dd 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -254,6 +254,7 @@ Other enhancements - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`) - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) +- :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`) - `OptionError` is now exposed in `pandas.errors` (:issue:`27553`) - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b522920ec9f23..35863d6dc8ea4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -140,7 +140,7 @@ from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt -from pandas.io.formats.info import info +from pandas.io.formats.info import DataFrameInfo, Info import pandas.plotting if TYPE_CHECKING: @@ -2460,11 +2460,11 @@ def to_html( RangeIndex: 5 entries, 0 to 4 Data columns (total 3 columns): - # Column Non-Null Count Dtype + # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 int_col 5 non-null int64 - 1 text_col 5 non-null object - 2 float_col 5 non-null float64 + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 dtypes: float64(1), int64(1), object(1) memory usage: 248.0+ bytes @@ -2503,11 +2503,11 @@ def to_html( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - # Column Non-Null Count Dtype + # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 22.9+ MB @@ -2515,11 +2515,11 @@ def to_html( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - # Column Non-Null Count Dtype + # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 188.8 MB""" ), @@ -2530,7 +2530,7 @@ def to_html( DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), ) - @doc(info) + @doc(Info) def info( self, verbose: Optional[bool] = None, @@ -2539,7 +2539,9 @@ def info( memory_usage: Optional[Union[bool, str]] = None, null_counts: Optional[bool] = None, ) -> None: - return info(self, verbose, buf, max_cols, memory_usage, null_counts) + return DataFrameInfo( + self, verbose, buf, max_cols, memory_usage, null_counts + ).get_info() def memory_usage(self, index=True, deep=False) -> Series: """ diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index b1dcafa7a7a8f..f1409d3671832 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,15 +1,16 @@ import sys -from typing import IO, TYPE_CHECKING, Optional, Tuple, Union +from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union from pandas._config import get_option from pandas._typing import Dtype, FrameOrSeries +from pandas.core.indexes.api import Index + from pandas.io.formats import format as fmt from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: - from pandas.core.indexes.api import Index # noqa: F401 from pandas.core.series import Series # noqa: F401 @@ -39,35 +40,7 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) -def _get_ids_and_dtypes(data: FrameOrSeries) -> Tuple["Index", "Series"]: - """ - Get DataFrame's columns and dtypes. - - Parameters - ---------- - data : DataFrame - Object that `info` was called on. - - Returns - ------- - ids : Index - DataFrame's columns. - dtypes : Series - Dtype of each of the DataFrame's columns. - """ - ids = data.columns - dtypes = data.dtypes - return ids, dtypes - - -def info( - data: FrameOrSeries, - verbose: Optional[bool] = None, - buf: Optional[IO[str]] = None, - max_cols: Optional[int] = None, - memory_usage: Optional[Union[bool, str]] = None, - null_counts: Optional[bool] = None, -) -> None: +class Info: """ Print a concise summary of a %(klass)s. @@ -119,40 +92,160 @@ def info( -------- %(examples_sub)s """ - if buf is None: # pragma: no cover - buf = sys.stdout - - lines = [] - - lines.append(str(type(data))) - lines.append(data.index._summary()) - - ids, dtypes = _get_ids_and_dtypes(data) - col_count = len(ids) - - if col_count == 0: - lines.append(f"Empty {type(data).__name__}") - fmt.buffer_put_lines(buf, lines) - return - - # hack - if max_cols is None: - max_cols = get_option("display.max_info_columns", col_count + 1) - - max_rows = get_option("display.max_info_rows", len(data) + 1) - - if null_counts is None: - show_counts = (col_count <= max_cols) and (len(data) < max_rows) - else: - show_counts = null_counts - exceeds_info_cols = col_count > max_cols - - def _verbose_repr(): - lines.append(f"Data columns (total {col_count} columns):") + def __init__( + self, + data: FrameOrSeries, + verbose: Optional[bool] = None, + buf: Optional[IO[str]] = None, + max_cols: Optional[int] = None, + memory_usage: Optional[Union[bool, str]] = None, + null_counts: Optional[bool] = None, + ): + if buf is None: # pragma: no cover + buf = sys.stdout + if memory_usage is None: + memory_usage = get_option("display.memory_usage") + + self.data = data + self.verbose = verbose + self.buf = buf + self.max_cols = max_cols + self.memory_usage = memory_usage + self.null_counts = null_counts + + def _get_mem_usage(self, deep: bool) -> int: + raise NotImplementedError + + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + raise NotImplementedError + + def _verbose_repr(self, lines, ids, dtypes, show_counts): + raise NotImplementedError + + def _non_verbose_repr(self, lines, ids): + raise NotImplementedError + + def get_info(self) -> None: + lines = [] + + lines.append(str(type(self.data))) + lines.append(self.data.index._summary()) + + ids, dtypes = self._get_ids_and_dtypes() + col_count = len(ids) + + if col_count == 0: + lines.append(f"Empty {type(self.data).__name__}") + fmt.buffer_put_lines(self.buf, lines) + return + + # hack + max_cols = self.max_cols + if max_cols is None: + max_cols = get_option("display.max_info_columns", col_count + 1) + + max_rows = get_option("display.max_info_rows", len(self.data) + 1) + + if self.null_counts is None: + show_counts = (col_count <= max_cols) and (len(self.data) < max_rows) + else: + show_counts = self.null_counts + exceeds_info_cols = col_count > max_cols + + def _sizeof_fmt(num, size_qualifier): + # returns size in human readable format + for x in ["bytes", "KB", "MB", "GB", "TB"]: + if num < 1024.0: + return f"{num:3.1f}{size_qualifier} {x}" + num /= 1024.0 + return f"{num:3.1f}{size_qualifier} PB" + + if self.verbose: + self._verbose_repr(lines, ids, dtypes, show_counts) + elif self.verbose is False: # specifically set to False, not necessarily None + self._non_verbose_repr(lines, ids) + else: + if exceeds_info_cols: + self._non_verbose_repr(lines, ids) + else: + self._verbose_repr(lines, ids, dtypes, show_counts) + + # groupby dtype.name to collect e.g. Categorical columns + counts = dtypes.value_counts().groupby(lambda x: x.name).sum() + collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] + lines.append(f"dtypes: {', '.join(collected_dtypes)}") + + if self.memory_usage: + # append memory usage of df to display + size_qualifier = "" + if self.memory_usage == "deep": + deep = True + else: + # size_qualifier is just a best effort; not guaranteed to catch + # all cases (e.g., it misses categorical data even with object + # categories) + deep = False + if "object" in counts or self.data.index._is_memory_usage_qualified(): + size_qualifier = "+" + mem_usage = self._get_mem_usage(deep=deep) + lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") + fmt.buffer_put_lines(self.buf, lines) + + +class DataFrameInfo(Info): + def _get_mem_usage(self, deep: bool) -> int: + """ + Get DataFrame's memory usage in bytes. + + Parameters + ---------- + deep : bool + If True, introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. + + Returns + ------- + mem_usage : int + Object's total memory usage in bytes. + """ + return self.data.memory_usage(index=True, deep=deep).sum() + + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + """ + Get DataFrame's column names and dtypes. + + Returns + ------- + ids : Index + DataFrame's column names. + dtypes : Series + Dtype of each of the DataFrame's columns. + """ + return self.data.columns, self.data.dtypes + + def _verbose_repr( + self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool + ) -> None: + """ + Display name, non-null count (optionally), and dtype for each column. + + Parameters + ---------- + lines : List[str] + Lines that will contain `info` representation. + ids : Index + The DataFrame's column names. + dtypes : Series + The DataFrame's columns' dtypes. + show_counts : bool + If True, count of non-NA cells for each column will be appended to `lines`. + """ id_head = " # " column_head = "Column" col_space = 2 + col_count = len(ids) max_col = max(len(pprint_thing(k)) for k in ids) len_column = len(pprint_thing(column_head)) @@ -162,9 +255,14 @@ def _verbose_repr(): len_id = len(pprint_thing(id_head)) space_num = max(max_id, len_id) + col_space + header = _put_str(id_head, space_num) + + lines.append(f"Data columns (total {col_count} columns):") + len_column = len(pprint_thing(column_head)) header = _put_str(id_head, space_num) + _put_str(column_head, space) + if show_counts: - counts = data.count() + counts = self.data.count() if col_count != len(counts): # pragma: no cover raise AssertionError( f"Columns must equal counts ({col_count} != {len(counts)})" @@ -213,46 +311,15 @@ def _verbose_repr(): + _put_str(dtype, space_dtype) ) - def _non_verbose_repr(): + def _non_verbose_repr(self, lines: List[str], ids: "Series") -> None: + """ + Append short summary of columns' names to `lines`. + + Parameters + ---------- + lines : List[str] + Lines that will contain `info` representation. + ids : Index + The DataFrame's column names. + """ lines.append(ids._summary(name="Columns")) - - def _sizeof_fmt(num, size_qualifier): - # returns size in human readable format - for x in ["bytes", "KB", "MB", "GB", "TB"]: - if num < 1024.0: - return f"{num:3.1f}{size_qualifier} {x}" - num /= 1024.0 - return f"{num:3.1f}{size_qualifier} PB" - - if verbose: - _verbose_repr() - elif verbose is False: # specifically set to False, not necessarily None - _non_verbose_repr() - else: - if exceeds_info_cols: - _non_verbose_repr() - else: - _verbose_repr() - - # groupby dtype.name to collect e.g. Categorical columns - counts = dtypes.value_counts().groupby(lambda x: x.name).sum() - collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] - lines.append(f"dtypes: {', '.join(collected_dtypes)}") - - if memory_usage is None: - memory_usage = get_option("display.memory_usage") - if memory_usage: - # append memory usage of df to display - size_qualifier = "" - if memory_usage == "deep": - deep = True - else: - # size_qualifier is just a best effort; not guaranteed to catch - # all cases (e.g., it misses categorical data even with object - # categories) - deep = False - if "object" in counts or data.index._is_memory_usage_qualified(): - size_qualifier = "+" - mem_usage = data.memory_usage(index=True, deep=deep).sum() - lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") - fmt.buffer_put_lines(buf, lines) From c309aceb45dd2ecc0401286318d3d1360e221783 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 13 Jun 2020 11:19:53 +0100 Subject: [PATCH 02/15] revert whatsnew change --- doc/source/whatsnew/v1.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 872c2598cd9dd..92f7c0f6b59a3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -254,7 +254,6 @@ Other enhancements - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`) - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) -- :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`) - `OptionError` is now exposed in `pandas.errors` (:issue:`27553`) - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) From 773ef968c021ae3bb2f674087e5474525830989e Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 13 Jun 2020 11:26:09 +0100 Subject: [PATCH 03/15] revert duplicated header assignment --- pandas/io/formats/info.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index f1409d3671832..efe850f105fb8 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -255,8 +255,6 @@ def _verbose_repr( len_id = len(pprint_thing(id_head)) space_num = max(max_id, len_id) + col_space - header = _put_str(id_head, space_num) - lines.append(f"Data columns (total {col_count} columns):") len_column = len(pprint_thing(column_head)) header = _put_str(id_head, space_num) + _put_str(column_head, space) From 40793bf6eb8bc99e034aeb0940d17f43ed8b9445 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 13 Jun 2020 11:27:41 +0100 Subject: [PATCH 04/15] revert blank line --- pandas/io/formats/info.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index efe850f105fb8..76e37e49eef9c 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -256,9 +256,7 @@ def _verbose_repr( space_num = max(max_id, len_id) + col_space lines.append(f"Data columns (total {col_count} columns):") - len_column = len(pprint_thing(column_head)) header = _put_str(id_head, space_num) + _put_str(column_head, space) - if show_counts: counts = self.data.count() if col_count != len(counts): # pragma: no cover From 80e9376b9937a76b58dab6483ec5a2507462b7a9 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 13 Jun 2020 11:30:34 +0100 Subject: [PATCH 05/15] :art: --- pandas/io/formats/info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 76e37e49eef9c..e3d499544bfd7 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -242,10 +242,11 @@ def _verbose_repr( show_counts : bool If True, count of non-NA cells for each column will be appended to `lines`. """ + col_count = len(ids) + lines.append(f"Data columns (total {col_count} columns):") id_head = " # " column_head = "Column" col_space = 2 - col_count = len(ids) max_col = max(len(pprint_thing(k)) for k in ids) len_column = len(pprint_thing(column_head)) @@ -255,7 +256,6 @@ def _verbose_repr( len_id = len(pprint_thing(id_head)) space_num = max(max_id, len_id) + col_space - lines.append(f"Data columns (total {col_count} columns):") header = _put_str(id_head, space_num) + _put_str(column_head, space) if show_counts: counts = self.data.count() From 9112631f4cc05a19944046f4614681c655550e3e Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 13 Jun 2020 11:31:25 +0100 Subject: [PATCH 06/15] :art: --- pandas/io/formats/info.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index e3d499544bfd7..eb598637e2901 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -244,6 +244,7 @@ def _verbose_repr( """ col_count = len(ids) lines.append(f"Data columns (total {col_count} columns):") + id_head = " # " column_head = "Column" col_space = 2 From 80c3b37dcfa8fe55fb3533fa49a00c9d92e317c3 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 13 Jun 2020 11:40:00 +0100 Subject: [PATCH 07/15] clarify docstring --- pandas/io/formats/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index eb598637e2901..1a25e8a15c190 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -229,7 +229,7 @@ def _verbose_repr( self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool ) -> None: """ - Display name, non-null count (optionally), and dtype for each column. + Append name, non-null count (optional), and dtype for each column to `lines`. Parameters ---------- From 8af274368b32cdb1851cef48e67dfd6367075fc4 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 13 Jun 2020 11:46:25 +0100 Subject: [PATCH 08/15] docsharing --- pandas/core/frame.py | 6 +-- pandas/io/formats/info.py | 105 +++++++++++++++++++------------------- 2 files changed, 55 insertions(+), 56 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 35863d6dc8ea4..b2fd94c77c751 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -140,7 +140,7 @@ from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt -from pandas.io.formats.info import DataFrameInfo, Info +from pandas.io.formats.info import DataFrameInfo import pandas.plotting if TYPE_CHECKING: @@ -2530,7 +2530,7 @@ def to_html( DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), ) - @doc(Info) + @doc(DataFrameInfo.info) def info( self, verbose: Optional[bool] = None, @@ -2541,7 +2541,7 @@ def info( ) -> None: return DataFrameInfo( self, verbose, buf, max_cols, memory_usage, null_counts - ).get_info() + ).info() def memory_usage(self, index=True, deep=False) -> Series: """ diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 1a25e8a15c190..4a4c30e52f47a 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -41,58 +41,6 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: class Info: - """ - Print a concise summary of a %(klass)s. - - This method prints information about a %(klass)s including - the index dtype%(type_sub)s, non-null values and memory usage. - - Parameters - ---------- - data : %(klass)s - %(klass)s to print information about. - verbose : bool, optional - Whether to print the full summary. By default, the setting in - ``pandas.options.display.max_info_columns`` is followed. - buf : writable buffer, defaults to sys.stdout - Where to send the output. By default, the output is printed to - sys.stdout. Pass a writable buffer if you need to further process - the output. - %(max_cols_sub)s - memory_usage : bool, str, optional - Specifies whether total memory usage of the %(klass)s - elements (including the index) should be displayed. By default, - this follows the ``pandas.options.display.memory_usage`` setting. - - True always show memory usage. False never shows memory usage. - A value of 'deep' is equivalent to "True with deep introspection". - Memory usage is shown in human-readable units (base-2 - representation). Without deep introspection a memory estimation is - made based in column dtype and number of rows assuming values - consume the same memory amount for corresponding dtypes. With deep - memory introspection, a real memory usage calculation is performed - at the cost of computational resources. - null_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the %(klass)s is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. - - Returns - ------- - None - This method prints a summary of a %(klass)s and returns None. - - See Also - -------- - %(see_also_sub)s - - Examples - -------- - %(examples_sub)s - """ - def __init__( self, data: FrameOrSeries, @@ -126,7 +74,58 @@ def _verbose_repr(self, lines, ids, dtypes, show_counts): def _non_verbose_repr(self, lines, ids): raise NotImplementedError - def get_info(self) -> None: + def info(self) -> None: + """ + Print a concise summary of a %(klass)s. + + This method prints information about a %(klass)s including + the index dtype%(type_sub)s, non-null values and memory usage. + + Parameters + ---------- + data : %(klass)s + %(klass)s to print information about. + verbose : bool, optional + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. + buf : writable buffer, defaults to sys.stdout + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + %(max_cols_sub)s + memory_usage : bool, str, optional + Specifies whether total memory usage of the %(klass)s + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. + + True always show memory usage. False never shows memory usage. + A value of 'deep' is equivalent to "True with deep introspection". + Memory usage is shown in human-readable units (base-2 + representation). Without deep introspection a memory estimation is + made based in column dtype and number of rows assuming values + consume the same memory amount for corresponding dtypes. With deep + memory introspection, a real memory usage calculation is performed + at the cost of computational resources. + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the %(klass)s is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. + + Returns + ------- + None + This method prints a summary of a %(klass)s and returns None. + + See Also + -------- + %(see_also_sub)s + + Examples + -------- + %(examples_sub)s + """ lines = [] lines.append(str(type(self.data))) From da80c56cb6512dc90074bdc194ffad2cdc38ae6d Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 13 Jun 2020 20:54:16 +0100 Subject: [PATCH 09/15] use abc --- pandas/io/formats/info.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 4a4c30e52f47a..b830771f60c30 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,3 +1,4 @@ +from abc import ABCMeta, abstractmethod import sys from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union @@ -40,7 +41,7 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) -class Info: +class Info(metaclass=ABCMeta): def __init__( self, data: FrameOrSeries, @@ -62,17 +63,21 @@ def __init__( self.memory_usage = memory_usage self.null_counts = null_counts + @abstractmethod def _get_mem_usage(self, deep: bool) -> int: - raise NotImplementedError + pass + @abstractmethod def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: - raise NotImplementedError + pass + @abstractmethod def _verbose_repr(self, lines, ids, dtypes, show_counts): - raise NotImplementedError + pass + @abstractmethod def _non_verbose_repr(self, lines, ids): - raise NotImplementedError + pass def info(self) -> None: """ From 99c3042b37817c121cc5bcee3e49d0088fe0a1b0 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 14 Jun 2020 09:33:16 +0100 Subject: [PATCH 10/15] BaseInfo --- pandas/io/formats/info.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index b830771f60c30..d0be68bf7f494 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -35,13 +35,13 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: -------- >>> pd.io.formats.info._put_str("panda", 6) 'panda ' - >>> pd.io.formats.info._put_str("panda", 4) + >>> pd.io.formats.info._pute_str("panda", 4) 'pand' """ return str(s)[:space].ljust(space) -class Info(metaclass=ABCMeta): +class BaseInfo(metaclass=ABCMeta): def __init__( self, data: FrameOrSeries, @@ -197,7 +197,7 @@ def _sizeof_fmt(num, size_qualifier): fmt.buffer_put_lines(self.buf, lines) -class DataFrameInfo(Info): +class DataFrameInfo(BaseInfo): def _get_mem_usage(self, deep: bool) -> int: """ Get DataFrame's memory usage in bytes. From 0ac85b011cce3b3fcf3c50f179723b1df535ef8c Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 14 Jun 2020 11:23:28 +0100 Subject: [PATCH 11/15] typo in example --- pandas/io/formats/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index d0be68bf7f494..7953d8f89599d 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -35,7 +35,7 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: -------- >>> pd.io.formats.info._put_str("panda", 6) 'panda ' - >>> pd.io.formats.info._pute_str("panda", 4) + >>> pd.io.formats.info._put_str("panda", 4) 'pand' """ return str(s)[:space].ljust(space) From 21ae291a5e0d8a70812e27bb97768c57569dc02a Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 18 Jun 2020 08:19:51 +0100 Subject: [PATCH 12/15] move docstrings to base class --- pandas/io/formats/info.py | 137 ++++++++++++++++++++++---------------- 1 file changed, 80 insertions(+), 57 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 7953d8f89599d..0d8d380f0d472 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -41,6 +41,37 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) +def _sizeof_fmt(num, size_qualifier): + """ + Return size in human readable format. + + Parameters + ---------- + num : int + Size in bytes. + size_qualifier : str + Either empty, or '+' (if memory is lower bound). + + Returns + ------- + str + Size in human readable format. + + Examples + -------- + >>> _sizeof_fmt(23028, '') + '22.5 KB' + + >>> _sizeof_fmt(23028, '+') + '22.5+ KB' + """ + for x in ["bytes", "KB", "MB", "GB", "TB"]: + if num < 1024.0: + return f"{num:3.1f}{size_qualifier} {x}" + num /= 1024.0 + return f"{num:3.1f}{size_qualifier} PB" + + class BaseInfo(metaclass=ABCMeta): def __init__( self, @@ -65,18 +96,67 @@ def __init__( @abstractmethod def _get_mem_usage(self, deep: bool) -> int: + """ + Get memory usage in bytes. + + Parameters + ---------- + deep : bool + If True, introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. + + Returns + ------- + mem_usage : int + Object's total memory usage in bytes. + """ pass @abstractmethod def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + """ + Get column names and dtypes. + + Returns + ------- + ids : Index + DataFrame's column names. + dtypes : Series + Dtype of each of the DataFrame's columns. + """ pass @abstractmethod def _verbose_repr(self, lines, ids, dtypes, show_counts): + """ + Append name, non-null count (optional), and dtype for each column to `lines`. + + Parameters + ---------- + lines : List[str] + Lines that will contain `info` representation. + ids : Index + The DataFrame's column names. + dtypes : Series + The DataFrame's columns' dtypes. + show_counts : bool + If True, count of non-NA cells for each column will be appended to `lines`. + """ pass @abstractmethod def _non_verbose_repr(self, lines, ids): + """ + Append short summary of columns' names to `lines`. + + Parameters + ---------- + lines : List[str] + Lines that will contain `info` representation. + ids : Index + The DataFrame's column names. + """ pass def info(self) -> None: @@ -157,14 +237,6 @@ def info(self) -> None: show_counts = self.null_counts exceeds_info_cols = col_count > max_cols - def _sizeof_fmt(num, size_qualifier): - # returns size in human readable format - for x in ["bytes", "KB", "MB", "GB", "TB"]: - if num < 1024.0: - return f"{num:3.1f}{size_qualifier} {x}" - num /= 1024.0 - return f"{num:3.1f}{size_qualifier} PB" - if self.verbose: self._verbose_repr(lines, ids, dtypes, show_counts) elif self.verbose is False: # specifically set to False, not necessarily None @@ -199,53 +271,14 @@ def _sizeof_fmt(num, size_qualifier): class DataFrameInfo(BaseInfo): def _get_mem_usage(self, deep: bool) -> int: - """ - Get DataFrame's memory usage in bytes. - - Parameters - ---------- - deep : bool - If True, introspect the data deeply by interrogating object dtypes - for system-level memory consumption, and include it in the returned - values. - - Returns - ------- - mem_usage : int - Object's total memory usage in bytes. - """ return self.data.memory_usage(index=True, deep=deep).sum() def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: - """ - Get DataFrame's column names and dtypes. - - Returns - ------- - ids : Index - DataFrame's column names. - dtypes : Series - Dtype of each of the DataFrame's columns. - """ return self.data.columns, self.data.dtypes def _verbose_repr( self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool ) -> None: - """ - Append name, non-null count (optional), and dtype for each column to `lines`. - - Parameters - ---------- - lines : List[str] - Lines that will contain `info` representation. - ids : Index - The DataFrame's column names. - dtypes : Series - The DataFrame's columns' dtypes. - show_counts : bool - If True, count of non-NA cells for each column will be appended to `lines`. - """ col_count = len(ids) lines.append(f"Data columns (total {col_count} columns):") @@ -313,14 +346,4 @@ def _verbose_repr( ) def _non_verbose_repr(self, lines: List[str], ids: "Series") -> None: - """ - Append short summary of columns' names to `lines`. - - Parameters - ---------- - lines : List[str] - Lines that will contain `info` representation. - ids : Index - The DataFrame's column names. - """ lines.append(ids._summary(name="Columns")) From b4573eba6453215bb7d29963a34fd96b71a0eb26 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 18 Jun 2020 08:22:17 +0100 Subject: [PATCH 13/15] type annotations --- pandas/io/formats/info.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 0d8d380f0d472..7aa2f6199735f 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -41,7 +41,7 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) -def _sizeof_fmt(num, size_qualifier): +def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: """ Return size in human readable format. @@ -128,7 +128,9 @@ def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: pass @abstractmethod - def _verbose_repr(self, lines, ids, dtypes, show_counts): + def _verbose_repr( + self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool + ) -> None: """ Append name, non-null count (optional), and dtype for each column to `lines`. @@ -146,7 +148,7 @@ def _verbose_repr(self, lines, ids, dtypes, show_counts): pass @abstractmethod - def _non_verbose_repr(self, lines, ids): + def _non_verbose_repr(self, lines: List[str], ids: "Series") -> None: """ Append short summary of columns' names to `lines`. From dfc85fe102e5247bab7c15a1147f0234548e470c Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 18 Jun 2020 08:27:01 +0100 Subject: [PATCH 14/15] reword docstring --- pandas/io/formats/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 7aa2f6199735f..35907bc51d3ea 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -50,7 +50,7 @@ def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: num : int Size in bytes. size_qualifier : str - Either empty, or '+' (if memory is lower bound). + Either empty, or '+' (if lower bound). Returns ------- From 3330e8e61bf709f304fd2a818a7cfc44a39da444 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 18 Jun 2020 09:26:27 +0100 Subject: [PATCH 15/15] fix types --- pandas/io/formats/info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 35907bc51d3ea..7a53b46a4ac0f 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -148,7 +148,7 @@ def _verbose_repr( pass @abstractmethod - def _non_verbose_repr(self, lines: List[str], ids: "Series") -> None: + def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: """ Append short summary of columns' names to `lines`. @@ -347,5 +347,5 @@ def _verbose_repr( + _put_str(dtype, space_dtype) ) - def _non_verbose_repr(self, lines: List[str], ids: "Series") -> None: + def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: lines.append(ids._summary(name="Columns"))