From 4a1a2e7908e9cc129da230ef523a38ea90378078 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 30 Sep 2020 21:23:47 +0700 Subject: [PATCH 01/37] REF: polymorphism and builder pattern in info --- pandas/core/frame.py | 15 +- pandas/io/formats/info.py | 468 ++++++++++++++++++++++++-------------- 2 files changed, 308 insertions(+), 175 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9b2540a1ce043..ba6f776bd2e26 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2593,7 +2593,7 @@ def to_html( DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), ) - @doc(DataFrameInfo.info) + @doc(DataFrameInfo.to_buffer) def info( self, verbose: Optional[bool] = None, @@ -2602,9 +2602,16 @@ def info( memory_usage: Optional[Union[bool, str]] = None, null_counts: Optional[bool] = None, ) -> None: - return DataFrameInfo( - self, verbose, buf, max_cols, memory_usage, null_counts - ).info() + info = DataFrameInfo( + data=self, + memory_usage=memory_usage, + ) + info.to_buffer( + buf=buf, + max_cols=max_cols, + verbose=verbose, + null_counts=null_counts, + ) def memory_usage(self, index=True, deep=False) -> Series: """ diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index e8e41d4325103..76b723af62b1b 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,19 +1,17 @@ -from abc import ABCMeta, abstractmethod +from abc import abstractmethod import sys -from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union +from typing import IO, TYPE_CHECKING, List, Optional, Union from pandas._config import get_option from pandas._typing import Dtype, FrameOrSeries from pandas.core.indexes.api import Index +from pandas.core.series import Series from pandas.io.formats import format as fmt from pandas.io.formats.printing import pprint_thing -if TYPE_CHECKING: - from pandas.core.series import Series # noqa: F401 - def _put_str(s: Union[str, Dtype], space: int) -> str: """ @@ -72,92 +70,88 @@ def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: return f"{num:3.1f}{size_qualifier} PB" -class BaseInfo(metaclass=ABCMeta): +class DataFrameInfo: def __init__( self, data: FrameOrSeries, - verbose: Optional[bool] = None, - buf: Optional[IO[str]] = None, - max_cols: Optional[int] = None, memory_usage: Optional[Union[bool, str]] = None, - null_counts: Optional[bool] = None, ): - if buf is None: # pragma: no cover - buf = sys.stdout - if memory_usage is None: - memory_usage = get_option("display.memory_usage") - self.data = data - self.verbose = verbose - self.buf = buf - self.max_cols = max_cols - self.memory_usage = memory_usage - self.null_counts = null_counts + self.memory_usage = self._initialize_memory_usage(memory_usage) - @abstractmethod - def _get_mem_usage(self, deep: bool) -> int: - """ - Get memory usage in bytes. + def _initialize_memory_usage( + self, + memory_usage: Optional[Union[bool, str]] = None, + ) -> Union[bool, str]: + if memory_usage is None: + memory_usage = get_option("display.memory_usage") + return memory_usage - Parameters - ---------- - deep : bool - If True, introspect the data deeply by interrogating object dtypes - for system-level memory consumption, and include it in the returned - values. + @property + def ids(self) -> Index: + """Column names. Returns ------- - mem_usage : int - Object's total memory usage in bytes. + ids : Index + DataFrame's column names. """ + return self.data.columns - @abstractmethod - def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: - """ - Get column names and dtypes. + @property + def dtypes(self) -> Series: + """Dtypes. Returns ------- - ids : Index - DataFrame's column names. dtypes : Series Dtype of each of the DataFrame's columns. """ + return self.data.dtypes - @abstractmethod - def _verbose_repr( - self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool - ) -> None: - """ - Append name, non-null count (optional), and dtype for each column to `lines`. + @property + def mem_usage(self) -> int: + """Memory usage in bytes. Parameters ---------- - lines : List[str] - Lines that will contain `info` representation. - ids : Index - The DataFrame's column names. - dtypes : Series - The DataFrame's columns' dtypes. - show_counts : bool - If True, count of non-NA cells for each column will be appended to `lines`. - """ + deep : bool + If True, introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. - @abstractmethod - def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: + Returns + ------- + mem_usage : int + Object's total memory usage in bytes. """ - Append short summary of columns' names to `lines`. + if self.memory_usage == "deep": + deep = True + else: + deep = False + return self.data.memory_usage(index=True, deep=deep).sum() - Parameters - ---------- - lines : List[str] - Lines that will contain `info` representation. - ids : Index - The DataFrame's column names. - """ + @property + def size_qualifier(self) -> str: + size_qualifier = "" + if self.memory_usage: + if self.memory_usage != "deep": + # size_qualifier is just a best effort; not guaranteed to catch + # all cases (e.g., it misses categorical data even with object + # categories) + if ( + "object" in self.counts + or self.data.index._is_memory_usage_qualified() + ): + size_qualifier = "+" + return size_qualifier - def info(self) -> None: + @property + def counts(self): + # groupby dtype.name to collect e.g. Categorical columns + return self.dtypes.value_counts().groupby(lambda x: x.name).sum() + + def to_buffer(self, *, buf, max_cols, verbose, null_counts) -> None: """ Print a concise summary of a %(klass)s. @@ -209,147 +203,279 @@ def info(self) -> None: -------- %(examples_sub)s """ - lines = [] + printer = InfoPrinter( + info=self, + max_cols=max_cols, + verbose=verbose, + null_counts=null_counts, + ) + printer.to_buffer(buf) - lines.append(str(type(self.data))) - lines.append(self.data.index._summary()) - ids, dtypes = self._get_ids_and_dtypes() - col_count = len(ids) +class InfoPrinter: + def __init__( + self, + info: DataFrameInfo, + max_cols: Optional[int] = None, + verbose: Optional[bool] = None, + null_counts: Optional[bool] = None, + ): + self.info = info + self.data = info.data + self.max_cols = max_cols + self.verbose = verbose + self.null_counts = null_counts - if col_count == 0: - lines.append(f"Empty {type(self.data).__name__}") - fmt.buffer_put_lines(self.buf, lines) - return + @property + def max_cols(self): + return self._max_cols + @max_cols.setter + def max_cols(self, max_cols): # hack - max_cols = self.max_cols if max_cols is None: - max_cols = get_option("display.max_info_columns", col_count + 1) + max_cols = get_option("display.max_info_columns", self.col_count + 1) + self._max_cols = max_cols - max_rows = get_option("display.max_info_rows", len(self.data) + 1) + @property + def max_rows(self): + return get_option("display.max_info_rows", len(self.data) + 1) + @property + def exceeds_info_cols(self): + return self.col_count > self.max_cols + + @property + def show_counts(self) -> bool: if self.null_counts is None: - show_counts = (col_count <= max_cols) and (len(self.data) < max_rows) + return bool( + (self.col_count <= self.max_cols) and (len(self.data) < self.max_rows) + ) else: - show_counts = self.null_counts - exceeds_info_cols = col_count > max_cols + return self.null_counts + + @property + def col_count(self): + return len(self.info.ids) + def to_buffer(self, buf: Optional[IO[str]] = None) -> None: + klass = self._select_table_builder() + table_builder = klass(info=self.info, printer=self) + lines = table_builder.get_lines() + if buf is None: # pragma: no cover + buf = sys.stdout + fmt.buffer_put_lines(buf, lines) + + def _select_table_builder(self): if self.verbose: - self._verbose_repr(lines, ids, dtypes, show_counts) + return self._select_verbose_table_builder() elif self.verbose is False: # specifically set to False, not necessarily None - self._non_verbose_repr(lines, ids) + return TableBuilderNonVerbose else: - if exceeds_info_cols: - self._non_verbose_repr(lines, ids) + if self.exceeds_info_cols: + return TableBuilderNonVerbose else: - self._verbose_repr(lines, ids, dtypes, show_counts) + return self._select_verbose_table_builder() - # groupby dtype.name to collect e.g. Categorical columns - counts = dtypes.value_counts().groupby(lambda x: x.name).sum() - collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] - lines.append(f"dtypes: {', '.join(collected_dtypes)}") + def _select_verbose_table_builder(self): + if self.show_counts: + return TableBuilderVerboseWithCounts + else: + return TableBuilderVerboseNoCounts + + +class TableBuilderAbstract: + _lines: List[str] + def __init__(self, *, info, printer): + self.info = info + self.printer = printer + + def get_lines(self): + self._lines = [] + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_header_line() + self.add_separator_line() + self.add_body_lines() + self.add_dtypes_line() if self.memory_usage: - # append memory usage of df to display - size_qualifier = "" - if self.memory_usage == "deep": - deep = True - else: - # size_qualifier is just a best effort; not guaranteed to catch - # all cases (e.g., it misses categorical data even with object - # categories) - deep = False - if "object" in counts or self.data.index._is_memory_usage_qualified(): - size_qualifier = "+" - mem_usage = self._get_mem_usage(deep=deep) - lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") - fmt.buffer_put_lines(self.buf, lines) + self.add_memory_usage_line() + return self._lines + @property + def data(self): + return self.info.data -class DataFrameInfo(BaseInfo): - def _get_mem_usage(self, deep: bool) -> int: - return self.data.memory_usage(index=True, deep=deep).sum() + @property + def counts(self): + return self.info.counts - def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: - return self.data.columns, self.data.dtypes + @property + def memory_usage(self): + return self.info.memory_usage - def _verbose_repr( - self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool - ) -> None: - col_count = len(ids) - lines.append(f"Data columns (total {col_count} columns):") + @property + def ids(self): + return self.info.ids - id_head = " # " - column_head = "Column" - col_space = 2 + @property + def dtypes(self): + return self.info.dtypes - max_col = max(len(pprint_thing(k)) for k in ids) - len_column = len(pprint_thing(column_head)) - space = max(max_col, len_column) + col_space + @property + def show_counts(self): + return self.printer.show_counts - max_id = len(pprint_thing(col_count)) - len_id = len(pprint_thing(id_head)) - space_num = max(max_id, len_id) + col_space + @property + def col_count(self): + return self.printer.col_count - if show_counts: - counts = self.data.count() - if col_count != len(counts): # pragma: no cover - raise AssertionError( - f"Columns must equal counts ({col_count} != {len(counts)})" - ) - count_header = "Non-Null Count" - len_count = len(count_header) - non_null = " non-null" - max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) - space_count = max(len_count, max_count) + col_space - count_temp = "{count}" + non_null - else: - count_header = "" - space_count = len(count_header) - len_count = space_count - count_temp = "{count}" + def add_object_type_line(self): + self._lines.append(str(type(self.data))) + + def add_index_range_line(self): + self._lines.append(self.data.index._summary()) + + @abstractmethod + def add_columns_summary_line(self): + pass + + @abstractmethod + def add_header_line(self): + pass + + @abstractmethod + def add_separator_line(self): + pass + + def add_body_lines(self): + if self.col_count == 0: + self._lines.append(f"Empty {type(self.data).__name__}") + + def add_dtypes_line(self): + collected_dtypes = [ + f"{key}({val:d})" for key, val in sorted(self.counts.items()) + ] + self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") + + def add_memory_usage_line(self): + self._lines.append( + "memory usage: " + f"{_sizeof_fmt(self.info.mem_usage, self.info.size_qualifier)}\n" + ) + + +class TableBuilderNonVerbose(TableBuilderAbstract): + def add_columns_summary_line(self): + self._lines.append(self.ids._summary(name="Columns")) + + def add_header_line(self): + pass + + def add_separator_line(self): + pass - dtype_header = "Dtype" - len_dtype = len(dtype_header) - max_dtypes = max(len(pprint_thing(k)) for k in dtypes) - space_dtype = max(len_dtype, max_dtypes) - header = "".join( +class TableBuilderVerbose(TableBuilderAbstract): + COL_SPACE = 2 + SPACING = " " * COL_SPACE + HEADERS: List[str] + + def __init__(self, *, info, printer): + super().__init__(info=info, printer=printer) + self.strcols: List[List[str]] = self._get_strcols() + + @abstractmethod + def _get_strcols(self) -> List[List[str]]: + pass + + def add_columns_summary_line(self): + self._lines.append(f"Data columns (total {self.col_count} columns):") + + @property + def header_column_widths(self): + return [len(col) for col in self.HEADERS] + + @property + def body_column_widths(self): + return [max(len(x) for x in col) for col in self.strcols] + + @property + def gross_column_widths(self): + return [ + max(header_colwidth, body_colwidth) + for header_colwidth, body_colwidth in zip( + self.header_column_widths, self.body_column_widths + ) + ] + + def add_header_line(self): + header_line = self.SPACING.join( [ - _put_str(id_head, space_num), - _put_str(column_head, space), - _put_str(count_header, space_count), - _put_str(dtype_header, space_dtype), + _put_str(header, col_width) + for header, col_width in zip(self.HEADERS, self.gross_column_widths) ] ) - lines.append(header) + self._lines.append(header_line) - top_separator = "".join( + def add_separator_line(self): + separator_line = self.SPACING.join( [ - _put_str("-" * len_id, space_num), - _put_str("-" * len_column, space), - _put_str("-" * len_count, space_count), - _put_str("-" * len_dtype, space_dtype), + _put_str("-" * header_colwidth, gross_colwidth) + for header_colwidth, gross_colwidth in zip( + self.header_column_widths, self.gross_column_widths + ) ] ) - lines.append(top_separator) - - for i, col in enumerate(ids): - dtype = dtypes[i] - col = pprint_thing(col) - - line_no = _put_str(f" {i}", space_num) - count = "" - if show_counts: - count = counts[i] - - lines.append( - line_no - + _put_str(col, space) - + _put_str(count_temp.format(count=count), space_count) - + _put_str(dtype, space_dtype) + self._lines.append(separator_line) + + def add_body_lines(self): + strrows = list(zip(*self.strcols)) + for row in strrows: + body_line = self.SPACING.join( + [ + _put_str(col, gross_colwidth) + for col, gross_colwidth in zip(row, self.gross_column_widths) + ] ) + self._lines.append(body_line) + + +class TableBuilderVerboseNoCounts(TableBuilderVerbose): + + HEADERS = [ + " # ", + "Column", + "Dtype", + ] + + def _get_strcols(self) -> List[List[str]]: + line_numbers = [f" {i}" for i, _ in enumerate(self.ids)] + columns = [pprint_thing(col) for col in self.ids] + dtypes = [pprint_thing(dtype) for dtype in self.dtypes] + return [line_numbers, columns, dtypes] + + +class TableBuilderVerboseWithCounts(TableBuilderVerbose): + + HEADERS = [ + " # ", + "Column", + "Non-Null Count", + "Dtype", + ] + + @property + def count_non_null(self): + return "{count} non-null" - def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: - lines.append(ids._summary(name="Columns")) + def _get_strcols(self) -> List[List[str]]: + line_numbers = [f" {i}" for i, _ in enumerate(self.ids)] + columns = [pprint_thing(col) for col in self.ids] + non_null_counts = [ + self.count_non_null.format(count=count) for count in self.data.count() + ] + dtypes = [pprint_thing(dtype) for dtype in self.dtypes] + return [line_numbers, columns, non_null_counts, dtypes] From a3ccb832f1cc6e25400bced4355b1a96a992d500 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 1 Oct 2020 04:08:04 +0700 Subject: [PATCH 02/37] TST: adjust expected gap between # and Column --- pandas/tests/io/formats/test_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 7000daeb9b575..15ffaac10338b 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -87,7 +87,7 @@ def test_info_verbose(): frame.info(verbose=True, buf=buf) res = buf.getvalue() - header = " # Column Dtype \n--- ------ ----- " + header = " # Column Dtype \n--- ------ ----- " assert header in res frame.info(verbose=True, buf=buf) From a5e11365c58d16f576caa791db001fd53957ecf1 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 1 Oct 2020 04:36:49 +0700 Subject: [PATCH 03/37] LINT: remove TYPE_CHECKING import For some reason I got import error when importing Series under TYPE_CHECKING only. So, now I import Series normally and delete TYPE_CHECKING --- pandas/io/formats/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 76b723af62b1b..974933b365972 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,6 +1,6 @@ from abc import abstractmethod import sys -from typing import IO, TYPE_CHECKING, List, Optional, Union +from typing import IO, List, Optional, Union from pandas._config import get_option From 401d7d8128ea2688f5bb84879b36cab3010124a2 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 1 Oct 2020 10:55:21 +0700 Subject: [PATCH 04/37] TST: add test for empty dataframe info --- pandas/tests/io/formats/test_info.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 15ffaac10338b..fd3fa07d0b15c 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -51,6 +51,20 @@ def datetime_frame(): return DataFrame(tm.getTimeSeriesData()) +def test_info_empty(): + df = DataFrame() + buf = StringIO() + df.info(buf=buf) + result = buf.getvalue() + expected = textwrap.dedent( + """\ + + Index: 0 entries + Empty DataFrame""" + ) + assert result == expected + + def test_info_categorical_column(): # make sure it works From 6de3eb7ba496b9f36f798bfffb294ae15cb0e8b2 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 1 Oct 2020 10:55:38 +0700 Subject: [PATCH 05/37] FIX: handle empty dataframe case --- pandas/io/formats/info.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 974933b365972..75792160c8a32 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -293,6 +293,18 @@ def __init__(self, *, info, printer): def get_lines(self): self._lines = [] + if self.col_count == 0: + self._fill_empty_info() + else: + self._fill_non_empty_info() + return self._lines + + def _fill_empty_info(self): + self.add_object_type_line() + self.add_index_range_line() + self._lines.append(f"Empty {type(self.data).__name__}") + + def _fill_non_empty_info(self): self.add_object_type_line() self.add_index_range_line() self.add_columns_summary_line() @@ -302,7 +314,6 @@ def get_lines(self): self.add_dtypes_line() if self.memory_usage: self.add_memory_usage_line() - return self._lines @property def data(self): @@ -350,9 +361,9 @@ def add_header_line(self): def add_separator_line(self): pass + @abstractmethod def add_body_lines(self): - if self.col_count == 0: - self._lines.append(f"Empty {type(self.data).__name__}") + pass def add_dtypes_line(self): collected_dtypes = [ @@ -377,6 +388,9 @@ def add_header_line(self): def add_separator_line(self): pass + def add_body_lines(self): + pass + class TableBuilderVerbose(TableBuilderAbstract): COL_SPACE = 2 From 82f9ddcc68c8624fe15974dfa7f3189b782030c7 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 1 Oct 2020 12:48:04 +0700 Subject: [PATCH 06/37] REF: extract abstract cls BaseInfo and builder --- pandas/io/formats/info.py | 91 +++++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 19 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 75792160c8a32..3eae22c8115e6 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,4 +1,4 @@ -from abc import abstractmethod +from abc import ABC, abstractmethod import sys from typing import IO, List, Optional, Union @@ -70,7 +70,17 @@ def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: return f"{num:3.1f}{size_qualifier} PB" -class DataFrameInfo: +class BaseInfo(ABC): + """Base class for DataFrameInfo and SeriesInfo. + + Parameters + ---------- + data : FrameOrSeries + Either dataframe or series. + memory_usage : bool + + """ + def __init__( self, data: FrameOrSeries, @@ -88,17 +98,12 @@ def _initialize_memory_usage( return memory_usage @property + @abstractmethod def ids(self) -> Index: - """Column names. - - Returns - ------- - ids : Index - DataFrame's column names. - """ - return self.data.columns + pass @property + @abstractmethod def dtypes(self) -> Series: """Dtypes. @@ -146,6 +151,32 @@ def size_qualifier(self) -> str: size_qualifier = "+" return size_qualifier + +class DataFrameInfo(BaseInfo): + """Class storing dataframe-specific info.""" + + @property + def ids(self) -> Index: + """Column names. + + Returns + ------- + ids : Index + DataFrame's column names. + """ + return self.data.columns + + @property + def dtypes(self) -> Series: + """Dtypes. + + Returns + ------- + dtypes : Series + Dtype of each of the DataFrame's columns. + """ + return self.data.dtypes + @property def counts(self): # groupby dtype.name to collect e.g. Categorical columns @@ -213,6 +244,8 @@ def to_buffer(self, *, buf, max_cols, verbose, null_counts) -> None: class InfoPrinter: + """Class for printing dataframe or series info.""" + def __init__( self, info: DataFrameInfo, @@ -270,21 +303,23 @@ def _select_table_builder(self): if self.verbose: return self._select_verbose_table_builder() elif self.verbose is False: # specifically set to False, not necessarily None - return TableBuilderNonVerbose + return DataFrameTableBuilderNonVerbose else: if self.exceeds_info_cols: - return TableBuilderNonVerbose + return DataFrameTableBuilderNonVerbose else: return self._select_verbose_table_builder() def _select_verbose_table_builder(self): if self.show_counts: - return TableBuilderVerboseWithCounts + return DataFrameTableBuilderVerboseWithCounts else: - return TableBuilderVerboseNoCounts + return DataFrameTableBuilderVerboseNoCounts -class TableBuilderAbstract: +class TableBuilderAbstract(ABC): + """Abstract builder for info table.""" + _lines: List[str] def __init__(self, *, info, printer): @@ -299,6 +334,18 @@ def get_lines(self): self._fill_non_empty_info() return self._lines + @abstractmethod + def _fill_empty_info(self): + pass + + @abstractmethod + def _fill_non_empty_info(self): + pass + + +class DataFrameTableBuilder(TableBuilderAbstract): + """Abstract builder for dataframe info table.""" + def _fill_empty_info(self): self.add_object_type_line() self.add_index_range_line() @@ -378,7 +425,9 @@ def add_memory_usage_line(self): ) -class TableBuilderNonVerbose(TableBuilderAbstract): +class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): + """Info table builder for non-verbose output.""" + def add_columns_summary_line(self): self._lines.append(self.ids._summary(name="Columns")) @@ -392,7 +441,9 @@ def add_body_lines(self): pass -class TableBuilderVerbose(TableBuilderAbstract): +class DataFrameTableBuilderVerbose(DataFrameTableBuilder): + """Info table builder for verbose output.""" + COL_SPACE = 2 SPACING = " " * COL_SPACE HEADERS: List[str] @@ -457,7 +508,8 @@ def add_body_lines(self): self._lines.append(body_line) -class TableBuilderVerboseNoCounts(TableBuilderVerbose): +class DataFrameTableBuilderVerboseNoCounts(DataFrameTableBuilderVerbose): + """Verbose info table builder without non-null counts column.""" HEADERS = [ " # ", @@ -472,7 +524,8 @@ def _get_strcols(self) -> List[List[str]]: return [line_numbers, columns, dtypes] -class TableBuilderVerboseWithCounts(TableBuilderVerbose): +class DataFrameTableBuilderVerboseWithCounts(DataFrameTableBuilderVerbose): + """Verbose info table builder with non-null counts column.""" HEADERS = [ " # ", From 75d65fb5dc85037e0863eeb035b71c73f7bc129d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 1 Oct 2020 12:57:25 +0700 Subject: [PATCH 07/37] DOC: fix docstrings --- pandas/io/formats/info.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 3eae22c8115e6..ba5810e8e8216 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -77,8 +77,10 @@ class BaseInfo(ABC): ---------- data : FrameOrSeries Either dataframe or series. - memory_usage : bool - + memory_usage : bool or str, optional + If "deep", introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. """ def __init__( @@ -118,13 +120,6 @@ def dtypes(self) -> Series: def mem_usage(self) -> int: """Memory usage in bytes. - Parameters - ---------- - deep : bool - If True, introspect the data deeply by interrogating object dtypes - for system-level memory consumption, and include it in the returned - values. - Returns ------- mem_usage : int From 25d6ac828e3abee71df9a7d2194ee5a95e441752 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 1 Oct 2020 13:34:49 +0700 Subject: [PATCH 08/37] REF: de-duplicate line_numbers, columns, dtypes --- pandas/io/formats/info.py | 49 +++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index ba5810e8e8216..913159b614a8d 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -104,6 +104,11 @@ def _initialize_memory_usage( def ids(self) -> Index: pass + @property + @abstractmethod + def counts(self): + pass + @property @abstractmethod def dtypes(self) -> Series: @@ -321,6 +326,14 @@ def __init__(self, *, info, printer): self.info = info self.printer = printer + @abstractmethod + def get_lines(self): + pass + + +class DataFrameTableBuilder(TableBuilderAbstract): + """Abstract builder for dataframe info table.""" + def get_lines(self): self._lines = [] if self.col_count == 0: @@ -329,18 +342,6 @@ def get_lines(self): self._fill_non_empty_info() return self._lines - @abstractmethod - def _fill_empty_info(self): - pass - - @abstractmethod - def _fill_non_empty_info(self): - pass - - -class DataFrameTableBuilder(TableBuilderAbstract): - """Abstract builder for dataframe info table.""" - def _fill_empty_info(self): self.add_object_type_line() self.add_index_range_line() @@ -502,6 +503,18 @@ def add_body_lines(self): ) self._lines.append(body_line) + def _get_line_numbers(self): + for i, _ in enumerate(self.ids): + yield f" {i}" + + def _get_columns(self): + for col in self.ids: + yield pprint_thing(col) + + def _get_dtypes(self): + for dtype in self.dtypes: + yield pprint_thing(dtype) + class DataFrameTableBuilderVerboseNoCounts(DataFrameTableBuilderVerbose): """Verbose info table builder without non-null counts column.""" @@ -513,9 +526,9 @@ class DataFrameTableBuilderVerboseNoCounts(DataFrameTableBuilderVerbose): ] def _get_strcols(self) -> List[List[str]]: - line_numbers = [f" {i}" for i, _ in enumerate(self.ids)] - columns = [pprint_thing(col) for col in self.ids] - dtypes = [pprint_thing(dtype) for dtype in self.dtypes] + line_numbers = list(self._get_line_numbers()) + columns = list(self._get_columns()) + dtypes = list(self._get_dtypes()) return [line_numbers, columns, dtypes] @@ -534,10 +547,10 @@ def count_non_null(self): return "{count} non-null" def _get_strcols(self) -> List[List[str]]: - line_numbers = [f" {i}" for i, _ in enumerate(self.ids)] - columns = [pprint_thing(col) for col in self.ids] + line_numbers = list(self._get_line_numbers()) + columns = list(self._get_columns()) + dtypes = list(self._get_dtypes()) non_null_counts = [ self.count_non_null.format(count=count) for count in self.data.count() ] - dtypes = [pprint_thing(dtype) for dtype in self.dtypes] return [line_numbers, columns, non_null_counts, dtypes] From 7a861dc22e518eae094f875c7275a2cb2c72aa5f Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 1 Oct 2020 13:38:01 +0700 Subject: [PATCH 09/37] CLN: make HEADERS one-liners --- pandas/io/formats/info.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 913159b614a8d..afc5dd6042c13 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -519,11 +519,7 @@ def _get_dtypes(self): class DataFrameTableBuilderVerboseNoCounts(DataFrameTableBuilderVerbose): """Verbose info table builder without non-null counts column.""" - HEADERS = [ - " # ", - "Column", - "Dtype", - ] + HEADERS = [" # ", "Column", "Dtype"] def _get_strcols(self) -> List[List[str]]: line_numbers = list(self._get_line_numbers()) @@ -535,12 +531,7 @@ def _get_strcols(self) -> List[List[str]]: class DataFrameTableBuilderVerboseWithCounts(DataFrameTableBuilderVerbose): """Verbose info table builder with non-null counts column.""" - HEADERS = [ - " # ", - "Column", - "Non-Null Count", - "Dtype", - ] + HEADERS = [" # ", "Column", "Non-Null Count", "Dtype"] @property def count_non_null(self): From 8640fb97600b3a7cae662f9a4a582f499de8ef38 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 1 Oct 2020 13:39:06 +0700 Subject: [PATCH 10/37] REF: make _initialize_memory_usage staticmethod --- pandas/io/formats/info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index afc5dd6042c13..0abb403bee0ea 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -91,8 +91,8 @@ def __init__( self.data = data self.memory_usage = self._initialize_memory_usage(memory_usage) + @staticmethod def _initialize_memory_usage( - self, memory_usage: Optional[Union[bool, str]] = None, ) -> Union[bool, str]: if memory_usage is None: From cd676b0b7a85161b285b3170f3fa66383362d678 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 1 Oct 2020 19:01:30 +0700 Subject: [PATCH 11/37] DOC/TYP: add docstrings and type annotations --- pandas/io/formats/info.py | 176 ++++++++++++++++++++++++-------------- 1 file changed, 113 insertions(+), 63 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 0abb403bee0ea..a2712dff6bcfe 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,17 +1,29 @@ from abc import ABC, abstractmethod import sys -from typing import IO, List, Optional, Union +from typing import ( + IO, + TYPE_CHECKING, + Iterator, + List, + Mapping, + Optional, + Sequence, + Type, + Union, +) from pandas._config import get_option from pandas._typing import Dtype, FrameOrSeries from pandas.core.indexes.api import Index -from pandas.core.series import Series from pandas.io.formats import format as fmt from pandas.io.formats.printing import pprint_thing +if TYPE_CHECKING: + from pandas import DataFrame, Series + def _put_str(s: Union[str, Dtype], space: int) -> str: """ @@ -106,12 +118,12 @@ def ids(self) -> Index: @property @abstractmethod - def counts(self): + def counts(self) -> Mapping[str, int]: pass @property @abstractmethod - def dtypes(self) -> Series: + def dtypes(self) -> "Series": """Dtypes. Returns @@ -167,7 +179,7 @@ def ids(self) -> Index: return self.data.columns @property - def dtypes(self) -> Series: + def dtypes(self) -> "Series": """Dtypes. Returns @@ -178,7 +190,7 @@ def dtypes(self) -> Series: return self.data.dtypes @property - def counts(self): + def counts(self) -> Mapping[str, int]: # groupby dtype.name to collect e.g. Categorical columns return self.dtypes.value_counts().groupby(lambda x: x.name).sum() @@ -244,7 +256,19 @@ def to_buffer(self, *, buf, max_cols, verbose, null_counts) -> None: class InfoPrinter: - """Class for printing dataframe or series info.""" + """Class for printing dataframe or series info. + + Parameters + ---------- + info : DataFrameInfo + Instance of DataFrameInfo. + max_cols : int, optional + When to switch from the verbose to the truncated output. + verbose : bool, optional + Whether to print the full summary. + null_counts : bool, optional + Whether to show the non-null counts. + """ def __init__( self, @@ -271,12 +295,12 @@ def max_cols(self, max_cols): self._max_cols = max_cols @property - def max_rows(self): + def max_rows(self) -> int: return get_option("display.max_info_rows", len(self.data) + 1) @property - def exceeds_info_cols(self): - return self.col_count > self.max_cols + def exceeds_info_cols(self) -> bool: + return bool(self.col_count > self.max_cols) @property def show_counts(self) -> bool: @@ -288,7 +312,7 @@ def show_counts(self) -> bool: return self.null_counts @property - def col_count(self): + def col_count(self) -> int: return len(self.info.ids) def to_buffer(self, buf: Optional[IO[str]] = None) -> None: @@ -299,7 +323,7 @@ def to_buffer(self, buf: Optional[IO[str]] = None) -> None: buf = sys.stdout fmt.buffer_put_lines(buf, lines) - def _select_table_builder(self): + def _select_table_builder(self) -> Type["DataFrameTableBuilder"]: if self.verbose: return self._select_verbose_table_builder() elif self.verbose is False: # specifically set to False, not necessarily None @@ -310,7 +334,7 @@ def _select_table_builder(self): else: return self._select_verbose_table_builder() - def _select_verbose_table_builder(self): + def _select_verbose_table_builder(self) -> Type["DataFrameTableBuilderVerbose"]: if self.show_counts: return DataFrameTableBuilderVerboseWithCounts else: @@ -318,7 +342,15 @@ def _select_verbose_table_builder(self): class TableBuilderAbstract(ABC): - """Abstract builder for info table.""" + """Abstract builder for info table. + + Parameters + ---------- + info : BaseInfo + Instance of DataFrameInfo or SeriesInfo. + printer : InfoPrinter + Instance of InfoPrinter. + """ _lines: List[str] @@ -327,14 +359,14 @@ def __init__(self, *, info, printer): self.printer = printer @abstractmethod - def get_lines(self): - pass + def get_lines(self) -> List[str]: + """Product in a form of list of lines (strings).""" class DataFrameTableBuilder(TableBuilderAbstract): """Abstract builder for dataframe info table.""" - def get_lines(self): + def get_lines(self) -> List[str]: self._lines = [] if self.col_count == 0: self._fill_empty_info() @@ -342,12 +374,14 @@ def get_lines(self): self._fill_non_empty_info() return self._lines - def _fill_empty_info(self): + def _fill_empty_info(self) -> None: + """Add lines to the info table, pertaining to empty dataframe.""" self.add_object_type_line() self.add_index_range_line() self._lines.append(f"Empty {type(self.data).__name__}") - def _fill_non_empty_info(self): + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" self.add_object_type_line() self.add_index_range_line() self.add_columns_summary_line() @@ -355,66 +389,72 @@ def _fill_non_empty_info(self): self.add_separator_line() self.add_body_lines() self.add_dtypes_line() - if self.memory_usage: + if self.display_memory_usage: self.add_memory_usage_line() @property - def data(self): + def data(self) -> "DataFrame": + """DataFrame.""" return self.info.data @property - def counts(self): + def counts(self) -> Mapping[str, int]: + """Mapping column - number of counts.""" return self.info.counts @property - def memory_usage(self): + def display_memory_usage(self) -> bool: + """Whether to display memory usage.""" return self.info.memory_usage @property - def ids(self): + def ids(self) -> Index: + """Dataframe columns.""" return self.info.ids @property - def dtypes(self): + def dtypes(self) -> "Series": + """Dtypes of each of the DataFrame's columns.""" return self.info.dtypes @property - def show_counts(self): - return self.printer.show_counts - - @property - def col_count(self): + def col_count(self) -> int: + """Number of dataframe columns.""" return self.printer.col_count - def add_object_type_line(self): + def add_object_type_line(self) -> None: + """Add line with string representation of dataframe to the table.""" self._lines.append(str(type(self.data))) - def add_index_range_line(self): + def add_index_range_line(self) -> None: + """Add line with range of indices to the table.""" self._lines.append(self.data.index._summary()) @abstractmethod - def add_columns_summary_line(self): - pass + def add_columns_summary_line(self) -> None: + """Add line with columns summary to the table.""" @abstractmethod - def add_header_line(self): - pass + def add_header_line(self) -> None: + """Add header line to the table.""" @abstractmethod - def add_separator_line(self): - pass + def add_separator_line(self) -> None: + """Add separator line between header and body of the table.""" @abstractmethod - def add_body_lines(self): - pass + def add_body_lines(self) -> None: + """Add content of the table body.""" - def add_dtypes_line(self): + def add_dtypes_line(self) -> None: + """Add summary line with dtypes present in dataframe.""" collected_dtypes = [ f"{key}({val:d})" for key, val in sorted(self.counts.items()) ] self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") - def add_memory_usage_line(self): + def add_memory_usage_line(self) -> None: + """Add line containing memory usage.""" self._lines.append( "memory usage: " f"{_sizeof_fmt(self.info.mem_usage, self.info.size_qualifier)}\n" @@ -424,16 +464,16 @@ def add_memory_usage_line(self): class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): """Info table builder for non-verbose output.""" - def add_columns_summary_line(self): + def add_columns_summary_line(self) -> None: self._lines.append(self.ids._summary(name="Columns")) - def add_header_line(self): + def add_header_line(self) -> None: pass - def add_separator_line(self): + def add_separator_line(self) -> None: pass - def add_body_lines(self): + def add_body_lines(self) -> None: pass @@ -442,29 +482,35 @@ class DataFrameTableBuilderVerbose(DataFrameTableBuilder): COL_SPACE = 2 SPACING = " " * COL_SPACE - HEADERS: List[str] + HEADERS: Sequence[str] def __init__(self, *, info, printer): super().__init__(info=info, printer=printer) - self.strcols: List[List[str]] = self._get_strcols() + self.strcols: Sequence[Sequence[str]] = self._get_strcols() @abstractmethod - def _get_strcols(self) -> List[List[str]]: - pass + def _get_strcols(self) -> Sequence[Sequence[str]]: + """Get columns content. + + Each element of the list represents a column data (list of rows). + """ - def add_columns_summary_line(self): + def add_columns_summary_line(self) -> None: self._lines.append(f"Data columns (total {self.col_count} columns):") @property - def header_column_widths(self): + def header_column_widths(self) -> Sequence[int]: + """Widths of header columns (only titles).""" return [len(col) for col in self.HEADERS] @property - def body_column_widths(self): + def body_column_widths(self) -> Sequence[int]: + """Widths of table content columns.""" return [max(len(x) for x in col) for col in self.strcols] @property - def gross_column_widths(self): + def gross_column_widths(self) -> Sequence[int]: + """Widths of columns containing both headers and actual content.""" return [ max(header_colwidth, body_colwidth) for header_colwidth, body_colwidth in zip( @@ -472,7 +518,7 @@ def gross_column_widths(self): ) ] - def add_header_line(self): + def add_header_line(self) -> None: header_line = self.SPACING.join( [ _put_str(header, col_width) @@ -481,7 +527,7 @@ def add_header_line(self): ) self._lines.append(header_line) - def add_separator_line(self): + def add_separator_line(self) -> None: separator_line = self.SPACING.join( [ _put_str("-" * header_colwidth, gross_colwidth) @@ -492,7 +538,7 @@ def add_separator_line(self): ) self._lines.append(separator_line) - def add_body_lines(self): + def add_body_lines(self) -> None: strrows = list(zip(*self.strcols)) for row in strrows: body_line = self.SPACING.join( @@ -503,15 +549,18 @@ def add_body_lines(self): ) self._lines.append(body_line) - def _get_line_numbers(self): + def _get_line_numbers(self) -> Iterator[str]: + """Iterator with string representation of column numbers.""" for i, _ in enumerate(self.ids): yield f" {i}" - def _get_columns(self): + def _get_columns(self) -> Iterator[str]: + """Iterator with string representation of column names.""" for col in self.ids: yield pprint_thing(col) - def _get_dtypes(self): + def _get_dtypes(self) -> Iterator[str]: + """Iterator with string representation of column dtypes.""" for dtype in self.dtypes: yield pprint_thing(dtype) @@ -521,7 +570,7 @@ class DataFrameTableBuilderVerboseNoCounts(DataFrameTableBuilderVerbose): HEADERS = [" # ", "Column", "Dtype"] - def _get_strcols(self) -> List[List[str]]: + def _get_strcols(self) -> Sequence[Sequence[str]]: line_numbers = list(self._get_line_numbers()) columns = list(self._get_columns()) dtypes = list(self._get_dtypes()) @@ -534,10 +583,11 @@ class DataFrameTableBuilderVerboseWithCounts(DataFrameTableBuilderVerbose): HEADERS = [" # ", "Column", "Non-Null Count", "Dtype"] @property - def count_non_null(self): + def count_non_null(self) -> str: + """String representation of non-null count column data.""" return "{count} non-null" - def _get_strcols(self) -> List[List[str]]: + def _get_strcols(self) -> Sequence[Sequence[str]]: line_numbers = list(self._get_line_numbers()) columns = list(self._get_columns()) dtypes = list(self._get_dtypes()) From 5ce8a7209d2539d4aab30113011b7caed414cf8c Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 1 Oct 2020 19:10:45 +0700 Subject: [PATCH 12/37] REF: create memory usage string in BaseInfo --- pandas/io/formats/info.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index a2712dff6bcfe..8b5deb886c16c 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -148,6 +148,10 @@ def mem_usage(self) -> int: deep = False return self.data.memory_usage(index=True, deep=deep).sum() + @property + def memory_usage_string(self) -> str: + return f"{_sizeof_fmt(self.mem_usage, self.size_qualifier)}\n" + @property def size_qualifier(self) -> str: size_qualifier = "" @@ -316,6 +320,7 @@ def col_count(self) -> int: return len(self.info.ids) def to_buffer(self, buf: Optional[IO[str]] = None) -> None: + """Save dataframe info into buffer.""" klass = self._select_table_builder() table_builder = klass(info=self.info, printer=self) lines = table_builder.get_lines() @@ -407,6 +412,11 @@ def display_memory_usage(self) -> bool: """Whether to display memory usage.""" return self.info.memory_usage + @property + def memory_usage_string(self) -> str: + """Memory usage string with proper size qualifier.""" + return self.info.memory_usage_string + @property def ids(self) -> Index: """Dataframe columns.""" @@ -455,10 +465,7 @@ def add_dtypes_line(self) -> None: def add_memory_usage_line(self) -> None: """Add line containing memory usage.""" - self._lines.append( - "memory usage: " - f"{_sizeof_fmt(self.info.mem_usage, self.info.size_qualifier)}\n" - ) + self._lines.append(f"memory usage: {self.memory_usage_string}") class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): From e0a203586dc3211eb285e36c88fdd303e078db9a Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 14:48:35 +0700 Subject: [PATCH 13/37] TYP: annotate DataFrameInfo.to_buffer --- pandas/io/formats/info.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 8b5deb886c16c..d16cadd09377f 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -198,7 +198,14 @@ def counts(self) -> Mapping[str, int]: # groupby dtype.name to collect e.g. Categorical columns return self.dtypes.value_counts().groupby(lambda x: x.name).sum() - def to_buffer(self, *, buf, max_cols, verbose, null_counts) -> None: + def to_buffer( + self, + *, + buf: Optional[IO[str]], + max_cols: Optional[int], + verbose: Optional[bool], + null_counts: Optional[bool], + ) -> None: """ Print a concise summary of a %(klass)s. From 3010d6ccfc96a256870c14b6f78812cc8b852509 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 18:17:48 +0700 Subject: [PATCH 14/37] REF: rename kwarg null_counts -> show_counts Changes concern internal implementation only. Public API remains the same. --- pandas/core/frame.py | 2 +- pandas/io/formats/info.py | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ba6f776bd2e26..c4cb6ad22fb24 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2610,7 +2610,7 @@ def info( buf=buf, max_cols=max_cols, verbose=verbose, - null_counts=null_counts, + show_counts=null_counts, ) def memory_usage(self, index=True, deep=False) -> Series: diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index d16cadd09377f..e72906d8615c0 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -204,7 +204,7 @@ def to_buffer( buf: Optional[IO[str]], max_cols: Optional[int], verbose: Optional[bool], - null_counts: Optional[bool], + show_counts: Optional[bool], ) -> None: """ Print a concise summary of a %(klass)s. @@ -261,7 +261,7 @@ def to_buffer( info=self, max_cols=max_cols, verbose=verbose, - null_counts=null_counts, + show_counts=show_counts, ) printer.to_buffer(buf) @@ -277,7 +277,7 @@ class InfoPrinter: When to switch from the verbose to the truncated output. verbose : bool, optional Whether to print the full summary. - null_counts : bool, optional + show_counts : bool, optional Whether to show the non-null counts. """ @@ -286,13 +286,13 @@ def __init__( info: DataFrameInfo, max_cols: Optional[int] = None, verbose: Optional[bool] = None, - null_counts: Optional[bool] = None, + show_counts: Optional[bool] = None, ): self.info = info self.data = info.data self.max_cols = max_cols self.verbose = verbose - self.null_counts = null_counts + self.show_counts = self._initialize_show_counts(show_counts) @property def max_cols(self): @@ -313,14 +313,13 @@ def max_rows(self) -> int: def exceeds_info_cols(self) -> bool: return bool(self.col_count > self.max_cols) - @property - def show_counts(self) -> bool: - if self.null_counts is None: + def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: + if show_counts is None: return bool( (self.col_count <= self.max_cols) and (len(self.data) < self.max_rows) ) else: - return self.null_counts + return show_counts @property def col_count(self) -> int: From 238f091c373cac3f9e6ef6a7640d38eafa431bb6 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 18:21:33 +0700 Subject: [PATCH 15/37] CLN: reuse exceeds_info_cols in show_counts --- pandas/io/formats/info.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index e72906d8615c0..8ede3a5c19c71 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -315,9 +315,7 @@ def exceeds_info_cols(self) -> bool: def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: if show_counts is None: - return bool( - (self.col_count <= self.max_cols) and (len(self.data) < self.max_rows) - ) + return bool(not self.exceeds_info_cols and (len(self.data) < self.max_rows)) else: return show_counts From ffeff499962b19487b96eae483743eadc451aab3 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 18:23:26 +0700 Subject: [PATCH 16/37] REF: extract property exceeds_info_rows --- pandas/io/formats/info.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 8ede3a5c19c71..ba2ba408fcaf7 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -313,16 +313,20 @@ def max_rows(self) -> int: def exceeds_info_cols(self) -> bool: return bool(self.col_count > self.max_cols) - def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: - if show_counts is None: - return bool(not self.exceeds_info_cols and (len(self.data) < self.max_rows)) - else: - return show_counts + @property + def exceeds_info_rows(self) -> bool: + return bool(len(self.data) > self.max_rows) @property def col_count(self) -> int: return len(self.info.ids) + def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: + if show_counts is None: + return bool(not self.exceeds_info_cols and not self.exceeds_info_rows) + else: + return show_counts + def to_buffer(self, buf: Optional[IO[str]] = None) -> None: """Save dataframe info into buffer.""" klass = self._select_table_builder() From f160074526be1f728b6084b0a0a3e8b230b6571a Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 18:29:42 +0700 Subject: [PATCH 17/37] DOC: add docstrings in InfoPrinter --- pandas/io/formats/info.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index ba2ba408fcaf7..38499671d1372 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -307,18 +307,22 @@ def max_cols(self, max_cols): @property def max_rows(self) -> int: + """Maximum info rows to be displayed.""" return get_option("display.max_info_rows", len(self.data) + 1) @property def exceeds_info_cols(self) -> bool: + """Check if number of columns to be summarized does not exceed maximum.""" return bool(self.col_count > self.max_cols) @property def exceeds_info_rows(self) -> bool: + """Check if number of rows to be summarized does not exceed maximum.""" return bool(len(self.data) > self.max_rows) @property def col_count(self) -> int: + """Number of columns to be summarized.""" return len(self.info.ids) def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: @@ -337,6 +341,7 @@ def to_buffer(self, buf: Optional[IO[str]] = None) -> None: fmt.buffer_put_lines(buf, lines) def _select_table_builder(self) -> Type["DataFrameTableBuilder"]: + """Select table builder based on verbosity and display settings.""" if self.verbose: return self._select_verbose_table_builder() elif self.verbose is False: # specifically set to False, not necessarily None @@ -348,6 +353,7 @@ def _select_table_builder(self) -> Type["DataFrameTableBuilder"]: return self._select_verbose_table_builder() def _select_verbose_table_builder(self) -> Type["DataFrameTableBuilderVerbose"]: + """Select verbose table builder: with or without non-null counts.""" if self.show_counts: return DataFrameTableBuilderVerboseWithCounts else: From c368a946a6fdcd2f4d0c1bc021e0dc01ec7138cd Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 18:40:11 +0700 Subject: [PATCH 18/37] CLN: rename counts -> dtype_counts This is more reasonable name for mapping {dtype: number of columns with this dtype}. --- pandas/io/formats/info.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 38499671d1372..9640b8d0a3d7b 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -118,7 +118,7 @@ def ids(self) -> Index: @property @abstractmethod - def counts(self) -> Mapping[str, int]: + def dtype_counts(self) -> Mapping[str, int]: pass @property @@ -161,7 +161,7 @@ def size_qualifier(self) -> str: # all cases (e.g., it misses categorical data even with object # categories) if ( - "object" in self.counts + "object" in self.dtype_counts or self.data.index._is_memory_usage_qualified() ): size_qualifier = "+" @@ -194,7 +194,8 @@ def dtypes(self) -> "Series": return self.data.dtypes @property - def counts(self) -> Mapping[str, int]: + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" # groupby dtype.name to collect e.g. Categorical columns return self.dtypes.value_counts().groupby(lambda x: x.name).sum() @@ -417,9 +418,9 @@ def data(self) -> "DataFrame": return self.info.data @property - def counts(self) -> Mapping[str, int]: - """Mapping column - number of counts.""" - return self.info.counts + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" + return self.info.dtype_counts @property def display_memory_usage(self) -> bool: @@ -473,7 +474,7 @@ def add_body_lines(self) -> None: def add_dtypes_line(self) -> None: """Add summary line with dtypes present in dataframe.""" collected_dtypes = [ - f"{key}({val:d})" for key, val in sorted(self.counts.items()) + f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items()) ] self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") From 43288e1648f19ce8bbb3d0749f8ae06ee42e9122 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 18:49:11 +0700 Subject: [PATCH 19/37] REF: extract property non_null_counts --- pandas/io/formats/info.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 9640b8d0a3d7b..2e92f4912adad 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -121,6 +121,11 @@ def ids(self) -> Index: def dtype_counts(self) -> Mapping[str, int]: pass + @property + @abstractmethod + def non_null_counts(self) -> Sequence[int]: + pass + @property @abstractmethod def dtypes(self) -> "Series": @@ -199,6 +204,11 @@ def dtype_counts(self) -> Mapping[str, int]: # groupby dtype.name to collect e.g. Categorical columns return self.dtypes.value_counts().groupby(lambda x: x.name).sum() + @property + def non_null_counts(self) -> Sequence[int]: + """Sequence of non-null counts for all columns.""" + return self.data.count() + def to_buffer( self, *, @@ -422,6 +432,10 @@ def dtype_counts(self) -> Mapping[str, int]: """Mapping dtype - number of counts.""" return self.info.dtype_counts + @property + def non_null_counts(self) -> Sequence[int]: + return self.info.non_null_counts + @property def display_memory_usage(self) -> bool: """Whether to display memory usage.""" @@ -614,6 +628,6 @@ def _get_strcols(self) -> Sequence[Sequence[str]]: columns = list(self._get_columns()) dtypes = list(self._get_dtypes()) non_null_counts = [ - self.count_non_null.format(count=count) for count in self.data.count() + self.count_non_null.format(count=count) for count in self.non_null_counts ] return [line_numbers, columns, non_null_counts, dtypes] From 6cb600a6331cbfa1b1577ba98d9c8e59bcdf5f0a Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 18:52:30 +0700 Subject: [PATCH 20/37] REF: extract method _get_non_null_counts --- pandas/io/formats/info.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 2e92f4912adad..58f03b7dfe225 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -618,16 +618,13 @@ class DataFrameTableBuilderVerboseWithCounts(DataFrameTableBuilderVerbose): HEADERS = [" # ", "Column", "Non-Null Count", "Dtype"] - @property - def count_non_null(self) -> str: - """String representation of non-null count column data.""" - return "{count} non-null" + def _get_non_null_counts(self) -> Iterator[str]: + for count in self.non_null_counts: + yield f"{count} non-null" def _get_strcols(self) -> Sequence[Sequence[str]]: line_numbers = list(self._get_line_numbers()) columns = list(self._get_columns()) dtypes = list(self._get_dtypes()) - non_null_counts = [ - self.count_non_null.format(count=count) for count in self.non_null_counts - ] + non_null_counts = list(self._get_non_null_counts()) return [line_numbers, columns, non_null_counts, dtypes] From 01bedfb5a57b858eb6d6b5e8f50af4a185d9a690 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 19:03:24 +0700 Subject: [PATCH 21/37] REF: generate rows for performance improvement Previously there was a need to iterate over all columns multiple times to create strcols. This commit creates row data by iterating over the dataframe columns only once. Then zip(*self.strrows) is used to form strcols. --- pandas/io/formats/info.py | 40 +++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 58f03b7dfe225..302c3d4f34130 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -522,13 +522,14 @@ class DataFrameTableBuilderVerbose(DataFrameTableBuilder): def __init__(self, *, info, printer): super().__init__(info=info, printer=printer) - self.strcols: Sequence[Sequence[str]] = self._get_strcols() + self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) + self.strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) @abstractmethod - def _get_strcols(self) -> Sequence[Sequence[str]]: - """Get columns content. + def _gen_rows(self) -> Iterator[Sequence[str]]: + """Generator function yielding rows content. - Each element of the list represents a column data (list of rows). + Each element represents a row comprising a sequence of strings. """ def add_columns_summary_line(self) -> None: @@ -575,8 +576,7 @@ def add_separator_line(self) -> None: self._lines.append(separator_line) def add_body_lines(self) -> None: - strrows = list(zip(*self.strcols)) - for row in strrows: + for row in self.strrows: body_line = self.SPACING.join( [ _put_str(col, gross_colwidth) @@ -606,11 +606,13 @@ class DataFrameTableBuilderVerboseNoCounts(DataFrameTableBuilderVerbose): HEADERS = [" # ", "Column", "Dtype"] - def _get_strcols(self) -> Sequence[Sequence[str]]: - line_numbers = list(self._get_line_numbers()) - columns = list(self._get_columns()) - dtypes = list(self._get_dtypes()) - return [line_numbers, columns, dtypes] + def _gen_rows(self) -> Iterator[Sequence[str]]: + for line_no, col, dtype in zip( + self._get_line_numbers(), + self._get_columns(), + self._get_dtypes(), + ): + yield line_no, col, dtype class DataFrameTableBuilderVerboseWithCounts(DataFrameTableBuilderVerbose): @@ -618,13 +620,15 @@ class DataFrameTableBuilderVerboseWithCounts(DataFrameTableBuilderVerbose): HEADERS = [" # ", "Column", "Non-Null Count", "Dtype"] + def _gen_rows(self) -> Iterator[Sequence[str]]: + for line_no, col, count, dtype in zip( + self._get_line_numbers(), + self._get_columns(), + self._get_non_null_counts(), + self._get_dtypes(), + ): + yield line_no, col, count, dtype + def _get_non_null_counts(self) -> Iterator[str]: for count in self.non_null_counts: yield f"{count} non-null" - - def _get_strcols(self) -> Sequence[Sequence[str]]: - line_numbers = list(self._get_line_numbers()) - columns = list(self._get_columns()) - dtypes = list(self._get_dtypes()) - non_null_counts = list(self._get_non_null_counts()) - return [line_numbers, columns, non_null_counts, dtypes] From 5bd4ce11a0c43bf888dbc6198ee1a53f99dbc411 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 19:13:56 +0700 Subject: [PATCH 22/37] REF: remove COL_SPACE class attribute --- pandas/io/formats/info.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 302c3d4f34130..8902477ce7cb0 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -516,8 +516,7 @@ def add_body_lines(self) -> None: class DataFrameTableBuilderVerbose(DataFrameTableBuilder): """Info table builder for verbose output.""" - COL_SPACE = 2 - SPACING = " " * COL_SPACE + SPACING = " " * 2 HEADERS: Sequence[str] def __init__(self, *, info, printer): From 269c9d7f32688bca4e0751a7de239bb3ab97220b Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 19:19:04 +0700 Subject: [PATCH 23/37] CLN: drop attribute strcols --- pandas/io/formats/info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 8902477ce7cb0..9129005e4b42c 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -522,7 +522,6 @@ class DataFrameTableBuilderVerbose(DataFrameTableBuilder): def __init__(self, *, info, printer): super().__init__(info=info, printer=printer) self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) - self.strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) @abstractmethod def _gen_rows(self) -> Iterator[Sequence[str]]: @@ -542,7 +541,8 @@ def header_column_widths(self) -> Sequence[int]: @property def body_column_widths(self) -> Sequence[int]: """Widths of table content columns.""" - return [max(len(x) for x in col) for col in self.strcols] + strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) + return [max(len(x) for x in col) for col in strcols] @property def gross_column_widths(self) -> Sequence[int]: From e27021a8a707a65f655f7cda61d007e5916d6c68 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 19:20:53 +0700 Subject: [PATCH 24/37] CLN: simplify iteration, without unpacking --- pandas/io/formats/info.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 9129005e4b42c..6bbaf4720dccd 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -606,12 +606,12 @@ class DataFrameTableBuilderVerboseNoCounts(DataFrameTableBuilderVerbose): HEADERS = [" # ", "Column", "Dtype"] def _gen_rows(self) -> Iterator[Sequence[str]]: - for line_no, col, dtype in zip( + for items in zip( self._get_line_numbers(), self._get_columns(), self._get_dtypes(), ): - yield line_no, col, dtype + yield items class DataFrameTableBuilderVerboseWithCounts(DataFrameTableBuilderVerbose): @@ -620,13 +620,13 @@ class DataFrameTableBuilderVerboseWithCounts(DataFrameTableBuilderVerbose): HEADERS = [" # ", "Column", "Non-Null Count", "Dtype"] def _gen_rows(self) -> Iterator[Sequence[str]]: - for line_no, col, count, dtype in zip( + for items in zip( self._get_line_numbers(), self._get_columns(), self._get_non_null_counts(), self._get_dtypes(), ): - yield line_no, col, count, dtype + yield items def _get_non_null_counts(self) -> Iterator[str]: for count in self.non_null_counts: From d0c15811ae148f523b7f0aec34e7669ad8db135b Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 19:23:34 +0700 Subject: [PATCH 25/37] CLN: rename _get_* gen functions to _gen_* --- pandas/io/formats/info.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 6bbaf4720dccd..10a749a99c79f 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -584,17 +584,17 @@ def add_body_lines(self) -> None: ) self._lines.append(body_line) - def _get_line_numbers(self) -> Iterator[str]: + def _gen_line_numbers(self) -> Iterator[str]: """Iterator with string representation of column numbers.""" for i, _ in enumerate(self.ids): yield f" {i}" - def _get_columns(self) -> Iterator[str]: + def _gen_columns(self) -> Iterator[str]: """Iterator with string representation of column names.""" for col in self.ids: yield pprint_thing(col) - def _get_dtypes(self) -> Iterator[str]: + def _gen_dtypes(self) -> Iterator[str]: """Iterator with string representation of column dtypes.""" for dtype in self.dtypes: yield pprint_thing(dtype) @@ -607,9 +607,9 @@ class DataFrameTableBuilderVerboseNoCounts(DataFrameTableBuilderVerbose): def _gen_rows(self) -> Iterator[Sequence[str]]: for items in zip( - self._get_line_numbers(), - self._get_columns(), - self._get_dtypes(), + self._gen_line_numbers(), + self._gen_columns(), + self._gen_dtypes(), ): yield items @@ -621,13 +621,13 @@ class DataFrameTableBuilderVerboseWithCounts(DataFrameTableBuilderVerbose): def _gen_rows(self) -> Iterator[Sequence[str]]: for items in zip( - self._get_line_numbers(), - self._get_columns(), - self._get_non_null_counts(), - self._get_dtypes(), + self._gen_line_numbers(), + self._gen_columns(), + self._gen_non_null_counts(), + self._gen_dtypes(), ): yield items - def _get_non_null_counts(self) -> Iterator[str]: + def _gen_non_null_counts(self) -> Iterator[str]: for count in self.non_null_counts: yield f"{count} non-null" From d9c8b7c42a8c2e59d392871563dfa50da993c634 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 19:28:49 +0700 Subject: [PATCH 26/37] CLN: avoid unpacking in for loop --- pandas/io/formats/info.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 10a749a99c79f..ff73cfed8a9af 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -548,10 +548,8 @@ def body_column_widths(self) -> Sequence[int]: def gross_column_widths(self) -> Sequence[int]: """Widths of columns containing both headers and actual content.""" return [ - max(header_colwidth, body_colwidth) - for header_colwidth, body_colwidth in zip( - self.header_column_widths, self.body_column_widths - ) + max(*widths) + for widths in zip(self.header_column_widths, self.body_column_widths) ] def add_header_line(self) -> None: From 5bdb1b1685d200ee6df0761ee3a84161963143a0 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 19:36:28 +0700 Subject: [PATCH 27/37] REF: replace setter with initializer function --- pandas/io/formats/info.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index ff73cfed8a9af..99426162ebd1f 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -155,6 +155,7 @@ def mem_usage(self) -> int: @property def memory_usage_string(self) -> str: + """Memory usage in a form of human readable string.""" return f"{_sizeof_fmt(self.mem_usage, self.size_qualifier)}\n" @property @@ -301,21 +302,10 @@ def __init__( ): self.info = info self.data = info.data - self.max_cols = max_cols self.verbose = verbose + self.max_cols = self._initialize_max_cols(max_cols) self.show_counts = self._initialize_show_counts(show_counts) - @property - def max_cols(self): - return self._max_cols - - @max_cols.setter - def max_cols(self, max_cols): - # hack - if max_cols is None: - max_cols = get_option("display.max_info_columns", self.col_count + 1) - self._max_cols = max_cols - @property def max_rows(self) -> int: """Maximum info rows to be displayed.""" @@ -336,6 +326,11 @@ def col_count(self) -> int: """Number of columns to be summarized.""" return len(self.info.ids) + def _initialize_max_cols(self, max_cols: Optional[int]) -> int: + if max_cols is None: + return get_option("display.max_info_columns", self.col_count + 1) + return max_cols + def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: if show_counts is None: return bool(not self.exceeds_info_cols and not self.exceeds_info_rows) From 05c8a02641fc6248275e1dc500cf8fa0a7b919a5 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 19:45:38 +0700 Subject: [PATCH 28/37] REF: eliminate dep of TableBuilder on InfoPrinter --- pandas/io/formats/info.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 99426162ebd1f..a2aea7b043aac 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -210,6 +210,11 @@ def non_null_counts(self) -> Sequence[int]: """Sequence of non-null counts for all columns.""" return self.data.count() + @property + def col_count(self) -> int: + """Number of columns to be summarized.""" + return len(self.ids) + def to_buffer( self, *, @@ -324,7 +329,7 @@ def exceeds_info_rows(self) -> bool: @property def col_count(self) -> int: """Number of columns to be summarized.""" - return len(self.info.ids) + return self.info.col_count def _initialize_max_cols(self, max_cols: Optional[int]) -> int: if max_cols is None: @@ -340,7 +345,7 @@ def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: def to_buffer(self, buf: Optional[IO[str]] = None) -> None: """Save dataframe info into buffer.""" klass = self._select_table_builder() - table_builder = klass(info=self.info, printer=self) + table_builder = klass(info=self.info) lines = table_builder.get_lines() if buf is None: # pragma: no cover buf = sys.stdout @@ -373,15 +378,12 @@ class TableBuilderAbstract(ABC): ---------- info : BaseInfo Instance of DataFrameInfo or SeriesInfo. - printer : InfoPrinter - Instance of InfoPrinter. """ _lines: List[str] - def __init__(self, *, info, printer): + def __init__(self, *, info): self.info = info - self.printer = printer @abstractmethod def get_lines(self) -> List[str]: @@ -453,8 +455,8 @@ def dtypes(self) -> "Series": @property def col_count(self) -> int: - """Number of dataframe columns.""" - return self.printer.col_count + """Number of dataframe columns to be summarized.""" + return self.info.col_count def add_object_type_line(self) -> None: """Add line with string representation of dataframe to the table.""" @@ -499,13 +501,13 @@ def add_columns_summary_line(self) -> None: self._lines.append(self.ids._summary(name="Columns")) def add_header_line(self) -> None: - pass + """No header in non-verbose output.""" def add_separator_line(self) -> None: - pass + """No separator in non-verbose output.""" def add_body_lines(self) -> None: - pass + """No body in non-verbose output.""" class DataFrameTableBuilderVerbose(DataFrameTableBuilder): @@ -514,8 +516,8 @@ class DataFrameTableBuilderVerbose(DataFrameTableBuilder): SPACING = " " * 2 HEADERS: Sequence[str] - def __init__(self, *, info, printer): - super().__init__(info=info, printer=printer) + def __init__(self, *, info): + super().__init__(info=info) self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) @abstractmethod From d8f27f2f737cab41718d272b57eafd8506cba1b6 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 19:50:37 +0700 Subject: [PATCH 29/37] CLN: use yield from in _gen_rows --- pandas/io/formats/info.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index a2aea7b043aac..3750e98c25798 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -601,12 +601,11 @@ class DataFrameTableBuilderVerboseNoCounts(DataFrameTableBuilderVerbose): HEADERS = [" # ", "Column", "Dtype"] def _gen_rows(self) -> Iterator[Sequence[str]]: - for items in zip( + yield from zip( self._gen_line_numbers(), self._gen_columns(), self._gen_dtypes(), - ): - yield items + ) class DataFrameTableBuilderVerboseWithCounts(DataFrameTableBuilderVerbose): @@ -615,13 +614,12 @@ class DataFrameTableBuilderVerboseWithCounts(DataFrameTableBuilderVerbose): HEADERS = [" # ", "Column", "Non-Null Count", "Dtype"] def _gen_rows(self) -> Iterator[Sequence[str]]: - for items in zip( + yield from zip( self._gen_line_numbers(), self._gen_columns(), self._gen_non_null_counts(), self._gen_dtypes(), - ): - yield items + ) def _gen_non_null_counts(self) -> Iterator[str]: for count in self.non_null_counts: From 36d91715b9ee2c2a3a5d1a1d0703e283e92524f4 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 20:04:32 +0700 Subject: [PATCH 30/37] DOC: add docstrings to abstract properties --- pandas/io/formats/info.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 3750e98c25798..2b34427d56755 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -114,17 +114,17 @@ def _initialize_memory_usage( @property @abstractmethod def ids(self) -> Index: - pass + """Column names or index names.""" @property @abstractmethod def dtype_counts(self) -> Mapping[str, int]: - pass + """Mapping dtype - number of counts.""" @property @abstractmethod def non_null_counts(self) -> Sequence[int]: - pass + """Sequence of non-null counts for all columns or column (if series).""" @property @abstractmethod From fb5c0e48111012634e9d1ddb2e0e9d7b5677e42a Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 7 Oct 2020 13:42:24 +0700 Subject: [PATCH 31/37] REF: remove method for verbose builder selection The logic with the selection of verbose builder was moved to function _select_table_builder. This makes the logic quite more complicated though. --- pandas/io/formats/info.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 919dddee5178e..4bf9428737f9c 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -355,21 +355,20 @@ def to_buffer(self, buf: Optional[IO[str]] = None) -> None: def _select_table_builder(self) -> Type["DataFrameTableBuilder"]: """Select table builder based on verbosity and display settings.""" if self.verbose: - return self._select_verbose_table_builder() + if self.show_counts: + return DataFrameTableBuilderVerboseWithCounts + else: + return DataFrameTableBuilderVerboseNoCounts elif self.verbose is False: # specifically set to False, not necessarily None return DataFrameTableBuilderNonVerbose else: if self.exceeds_info_cols: return DataFrameTableBuilderNonVerbose else: - return self._select_verbose_table_builder() - - def _select_verbose_table_builder(self) -> Type["DataFrameTableBuilderVerbose"]: - """Select verbose table builder: with or without non-null counts.""" - if self.show_counts: - return DataFrameTableBuilderVerboseWithCounts - else: - return DataFrameTableBuilderVerboseNoCounts + if self.show_counts: + return DataFrameTableBuilderVerboseWithCounts + else: + return DataFrameTableBuilderVerboseNoCounts class TableBuilderAbstract(ABC): From a41b04d5c7ff3b01473d06008ed909069e04b511 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 7 Oct 2020 13:58:46 +0700 Subject: [PATCH 32/37] REF: parametrize verbose table builders Previously there were two separate classes for verbose table builder. - DataFrameTableBuilderVerboseWithCounts - DataFrameTableBuilderVerboseNoCounts The review considered it as a complex pattern. The present commit leaves DataFrameTableBuilderVerbose, makes is parametrized (with_counts = True/False). --- pandas/io/formats/info.py | 108 ++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 57 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 4bf9428737f9c..3504d72612c2d 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,16 +1,6 @@ from abc import ABC, abstractmethod import sys -from typing import ( - IO, - TYPE_CHECKING, - Iterator, - List, - Mapping, - Optional, - Sequence, - Type, - Union, -) +from typing import IO, TYPE_CHECKING, Iterator, List, Mapping, Optional, Sequence, Union from pandas._config import get_option @@ -345,30 +335,31 @@ def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: def to_buffer(self, buf: Optional[IO[str]] = None) -> None: """Save dataframe info into buffer.""" - klass = self._select_table_builder() - table_builder = klass(info=self.info) + table_builder = self._create_table_builder() lines = table_builder.get_lines() if buf is None: # pragma: no cover buf = sys.stdout fmt.buffer_put_lines(buf, lines) - def _select_table_builder(self) -> Type["DataFrameTableBuilder"]: - """Select table builder based on verbosity and display settings.""" + def _create_table_builder(self) -> "DataFrameTableBuilder": + """ + Create instance of table builder based on verbosity and display settings. + """ if self.verbose: - if self.show_counts: - return DataFrameTableBuilderVerboseWithCounts - else: - return DataFrameTableBuilderVerboseNoCounts + return DataFrameTableBuilderVerbose( + info=self.info, + with_counts=self.show_counts, + ) elif self.verbose is False: # specifically set to False, not necessarily None - return DataFrameTableBuilderNonVerbose + return DataFrameTableBuilderNonVerbose(info=self.info) else: if self.exceeds_info_cols: - return DataFrameTableBuilderNonVerbose + return DataFrameTableBuilderNonVerbose(info=self.info) else: - if self.show_counts: - return DataFrameTableBuilderVerboseWithCounts - else: - return DataFrameTableBuilderVerboseNoCounts + return DataFrameTableBuilderVerbose( + info=self.info, + with_counts=self.show_counts, + ) class TableBuilderAbstract(ABC): @@ -514,18 +505,33 @@ class DataFrameTableBuilderVerbose(DataFrameTableBuilder): """Info table builder for verbose output.""" SPACING = " " * 2 - HEADERS: Sequence[str] - def __init__(self, *, info): + def __init__( + self, + *, + info: DataFrameInfo, + with_counts: bool, + ): super().__init__(info=info) + self.with_counts = with_counts self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) - @abstractmethod + @property + def headers(self) -> Sequence[str]: + """Headers names of the columns in verbose table.""" + if self.with_counts: + return [" # ", "Column", "Non-Null Count", "Dtype"] + return [" # ", "Column", "Dtype"] + def _gen_rows(self) -> Iterator[Sequence[str]]: """Generator function yielding rows content. Each element represents a row comprising a sequence of strings. """ + if self.with_counts: + return self._gen_rows_with_counts() + else: + return self._gen_rows_without_counts() def add_columns_summary_line(self) -> None: self._lines.append(f"Data columns (total {self.col_count} columns):") @@ -533,7 +539,7 @@ def add_columns_summary_line(self) -> None: @property def header_column_widths(self) -> Sequence[int]: """Widths of header columns (only titles).""" - return [len(col) for col in self.HEADERS] + return [len(col) for col in self.headers] @property def body_column_widths(self) -> Sequence[int]: @@ -553,7 +559,7 @@ def add_header_line(self) -> None: header_line = self.SPACING.join( [ _put_str(header, col_width) - for header, col_width in zip(self.HEADERS, self.gross_column_widths) + for header, col_width in zip(self.headers, self.gross_column_widths) ] ) self._lines.append(header_line) @@ -579,6 +585,21 @@ def add_body_lines(self) -> None: ) self._lines.append(body_line) + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + yield from zip( + self._gen_line_numbers(), + self._gen_columns(), + self._gen_dtypes(), + ) + + def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + yield from zip( + self._gen_line_numbers(), + self._gen_columns(), + self._gen_non_null_counts(), + self._gen_dtypes(), + ) + def _gen_line_numbers(self) -> Iterator[str]: """Iterator with string representation of column numbers.""" for i, _ in enumerate(self.ids): @@ -594,33 +615,6 @@ def _gen_dtypes(self) -> Iterator[str]: for dtype in self.dtypes: yield pprint_thing(dtype) - -class DataFrameTableBuilderVerboseNoCounts(DataFrameTableBuilderVerbose): - """Verbose info table builder without non-null counts column.""" - - HEADERS = [" # ", "Column", "Dtype"] - - def _gen_rows(self) -> Iterator[Sequence[str]]: - yield from zip( - self._gen_line_numbers(), - self._gen_columns(), - self._gen_dtypes(), - ) - - -class DataFrameTableBuilderVerboseWithCounts(DataFrameTableBuilderVerbose): - """Verbose info table builder with non-null counts column.""" - - HEADERS = [" # ", "Column", "Non-Null Count", "Dtype"] - - def _gen_rows(self) -> Iterator[Sequence[str]]: - yield from zip( - self._gen_line_numbers(), - self._gen_columns(), - self._gen_non_null_counts(), - self._gen_dtypes(), - ) - def _gen_non_null_counts(self) -> Iterator[str]: for count in self.non_null_counts: yield f"{count} non-null" From 1f0c4105e8baeabde434f05fcba6198299ff44eb Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 7 Oct 2020 14:06:40 +0700 Subject: [PATCH 33/37] REF: move static method to module level function --- pandas/io/formats/info.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 3504d72612c2d..5b453c9934cfd 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -73,6 +73,15 @@ def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: return f"{num:3.1f}{size_qualifier} PB" +def _initialize_memory_usage( + memory_usage: Optional[Union[bool, str]] = None, +) -> Union[bool, str]: + """Get memory usage based on inputs and display options.""" + if memory_usage is None: + memory_usage = get_option("display.memory_usage") + return memory_usage + + class BaseInfo(ABC): """Base class for DataFrameInfo and SeriesInfo. @@ -92,15 +101,7 @@ def __init__( memory_usage: Optional[Union[bool, str]] = None, ): self.data = data - self.memory_usage = self._initialize_memory_usage(memory_usage) - - @staticmethod - def _initialize_memory_usage( - memory_usage: Optional[Union[bool, str]] = None, - ) -> Union[bool, str]: - if memory_usage is None: - memory_usage = get_option("display.memory_usage") - return memory_usage + self.memory_usage = _initialize_memory_usage(memory_usage) @property @abstractmethod From 171d0287b0fa591c5ef73afe84e91b87bd75ec67 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 7 Oct 2020 14:06:58 +0700 Subject: [PATCH 34/37] CLN: rename mem_usage -> memory_usage_bytes --- pandas/io/formats/info.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 5b453c9934cfd..c05a20199c425 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -131,12 +131,12 @@ def dtypes(self) -> "Series": return self.data.dtypes @property - def mem_usage(self) -> int: + def memory_usage_bytes(self) -> int: """Memory usage in bytes. Returns ------- - mem_usage : int + memory_usage_bytes : int Object's total memory usage in bytes. """ if self.memory_usage == "deep": @@ -148,7 +148,7 @@ def mem_usage(self) -> int: @property def memory_usage_string(self) -> str: """Memory usage in a form of human readable string.""" - return f"{_sizeof_fmt(self.mem_usage, self.size_qualifier)}\n" + return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n" @property def size_qualifier(self) -> str: From c4c002e41505abde0e141eeb4fcced3283e921be Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 7 Oct 2020 15:07:04 +0700 Subject: [PATCH 35/37] DOC: add docstrings to iterator funcs --- pandas/io/formats/info.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index c05a20199c425..d104596e93f0f 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -587,6 +587,7 @@ def add_body_lines(self) -> None: self._lines.append(body_line) def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data without counts.""" yield from zip( self._gen_line_numbers(), self._gen_columns(), @@ -594,6 +595,7 @@ def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: ) def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data with counts.""" yield from zip( self._gen_line_numbers(), self._gen_columns(), @@ -617,5 +619,6 @@ def _gen_dtypes(self) -> Iterator[str]: yield pprint_thing(dtype) def _gen_non_null_counts(self) -> Iterator[str]: + """Iterator with string representation of non-null counts.""" for count in self.non_null_counts: yield f"{count} non-null" From 92c77546a672c8544cc025d59c9fe6170ffa5a8c Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 7 Oct 2020 22:45:24 +0700 Subject: [PATCH 36/37] PERF: property gross_column_widths to attribute --- pandas/io/formats/info.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index d104596e93f0f..cf6f1c694e2c8 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -516,6 +516,7 @@ def __init__( super().__init__(info=info) self.with_counts = with_counts self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) + self.gross_column_widths = self._get_gross_column_widths() @property def headers(self) -> Sequence[str]: @@ -542,20 +543,19 @@ def header_column_widths(self) -> Sequence[int]: """Widths of header columns (only titles).""" return [len(col) for col in self.headers] - @property - def body_column_widths(self) -> Sequence[int]: - """Widths of table content columns.""" - strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) - return [max(len(x) for x in col) for col in strcols] - - @property - def gross_column_widths(self) -> Sequence[int]: + def _get_gross_column_widths(self) -> Sequence[int]: """Widths of columns containing both headers and actual content.""" + body_column_widths = self._get_body_column_widths() return [ max(*widths) - for widths in zip(self.header_column_widths, self.body_column_widths) + for widths in zip(self.header_column_widths, body_column_widths) ] + def _get_body_column_widths(self) -> Sequence[int]: + """Widths of table content columns.""" + strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) + return [max(len(x) for x in col) for col in strcols] + def add_header_line(self) -> None: header_line = self.SPACING.join( [ From 5fe1cd7e8d72d2c1df30ce7f5e232f9f5fffe7c0 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 7 Oct 2020 22:55:25 +0700 Subject: [PATCH 37/37] TYP: attribute gross_column_widths --- pandas/io/formats/info.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index cf6f1c694e2c8..891b3ea7af0e2 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -516,7 +516,7 @@ def __init__( super().__init__(info=info) self.with_counts = with_counts self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) - self.gross_column_widths = self._get_gross_column_widths() + self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() @property def headers(self) -> Sequence[str]: @@ -544,7 +544,7 @@ def header_column_widths(self) -> Sequence[int]: return [len(col) for col in self.headers] def _get_gross_column_widths(self) -> Sequence[int]: - """Widths of columns containing both headers and actual content.""" + """Get widths of columns containing both headers and actual content.""" body_column_widths = self._get_body_column_widths() return [ max(*widths) @@ -552,7 +552,7 @@ def _get_gross_column_widths(self) -> Sequence[int]: ] def _get_body_column_widths(self) -> Sequence[int]: - """Widths of table content columns.""" + """Get widths of table content columns.""" strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) return [max(len(x) for x in col) for col in strcols]