From 1a86a5034814c7290681dc991bfe64314daeaa59 Mon Sep 17 00:00:00 2001 From: gcerri Date: Thu, 8 Aug 2024 16:00:16 +0200 Subject: [PATCH 1/8] Add option to DataFrame.info for structured output --- pandas/core/frame.py | 6 +++++- pandas/io/formats/info.py | 41 +++++++++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ea91046f4b8e4..9d7321138bd23 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3529,17 +3529,21 @@ def info( max_cols: int | None = None, memory_usage: bool | str | None = None, show_counts: bool | None = None, + return_dict: bool | None = None, ) -> None: info = DataFrameInfo( data=self, memory_usage=memory_usage, ) - info.render( + info_return = info.render( buf=buf, max_cols=max_cols, verbose=verbose, show_counts=show_counts, + return_dict=return_dict, ) + if return_dict: + return info_return def memory_usage(self, index: bool = True, deep: bool = False) -> Series: """ diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 469dcfb76ba0b..706f8f08fd265 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -494,7 +494,28 @@ def non_null_counts(self) -> Series: def memory_usage_bytes(self) -> int: deep = self.memory_usage == "deep" return self.data.memory_usage(index=True, deep=deep).sum() - + + def to_dict(self) -> dict: + """Return DataFrame info as a dictionary.""" + return { + 'Column summary': self._get_column_summary(), + 'Memory usage': self.memory_usage_bytes, + 'Index type': type(self.data.index).__name__, + 'Index entries': len(self.data.index), + } + + def _get_column_summary(self) -> list[dict]: + """Return a DataFrame summarizing columns.""" + return [ + { + '#': i, + 'Column': col, + 'Non-Null Count': self.data[col].notna().sum(), + 'Dtype': self.data[col].dtype + } + for i, col in enumerate(self.ids) + ] + def render( self, *, @@ -502,14 +523,18 @@ def render( max_cols: int | None, verbose: 
bool | None, show_counts: bool | None, + return_dict: bool | None, ) -> None: - printer = _DataFrameInfoPrinter( - info=self, - max_cols=max_cols, - verbose=verbose, - show_counts=show_counts, - ) - printer.to_buffer(buf) + if return_dict: + return self.to_dict() + else: + printer = _DataFrameInfoPrinter( + info=self, + max_cols=max_cols, + verbose=verbose, + show_counts=show_counts, + ) + printer.to_buffer(buf) class SeriesInfo(_BaseInfo): From 0abc4f4cef756bff62bf0f793912a29832ead70b Mon Sep 17 00:00:00 2001 From: gcerri Date: Fri, 9 Aug 2024 11:51:44 +0200 Subject: [PATCH 2/8] adding dash in the keys of df.info() as dictionary --- pandas/io/formats/info.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 706f8f08fd265..c7642f0964ab6 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -498,10 +498,10 @@ def memory_usage_bytes(self) -> int: def to_dict(self) -> dict: """Return DataFrame info as a dictionary.""" return { - 'Column summary': self._get_column_summary(), - 'Memory usage': self.memory_usage_bytes, - 'Index type': type(self.data.index).__name__, - 'Index entries': len(self.data.index), + 'Column_summary': self._get_column_summary(), + 'Memory_usage': self.memory_usage_bytes, + 'Index_type': type(self.data.index).__name__, + 'Index_entries': len(self.data.index), } def _get_column_summary(self) -> list[dict]: @@ -510,7 +510,7 @@ def _get_column_summary(self) -> list[dict]: { '#': i, 'Column': col, - 'Non-Null Count': self.data[col].notna().sum(), + 'Non-Null-Count': self.data[col].notna().sum(), 'Dtype': self.data[col].dtype } for i, col in enumerate(self.ids) From dd48b41ac1363ead8bb389155031fa9552698b1f Mon Sep 17 00:00:00 2001 From: gcerri Date: Fri, 9 Aug 2024 11:52:51 +0200 Subject: [PATCH 3/8] Add unit tests for info() with return_dict=True --- pandas/tests/frame/methods/test_info.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 
insertions(+) diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index a4319f8a8ae7f..4771f435cc8fa 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -569,3 +569,28 @@ def test_info_show_counts(row, columns, show_counts, result): with StringIO() as buf: df.info(buf=buf, show_counts=show_counts) assert ("non-null" in buf.getvalue()) is result + +@pytest.mark.parametrize( + "df", [ + DataFrame({ + 'A': [1, 2, 3], + 'B': [4, 5, 6] + }), + DataFrame({}), + ] +) +def test_info_return_dict(df): + result = df.info(return_dict=True) + expected_keys = {'Column_summary', 'Memory_usage', 'Index_type', 'Index_entries'} + assert isinstance(result, dict) + assert expected_keys.issubset(result.keys()) + + assert 'Column_summary' in result + assert 'Memory_usage' in result + assert 'Index_type' in result + assert 'Index_entries' in result + + assert isinstance(result['Column_summary'], list) + assert isinstance(result['Memory_usage'], np.int64) + assert isinstance(result['Index_type'], str) + assert isinstance(result['Index_entries'], int) \ No newline at end of file From 3453f874eb816473241436afa4a55e3a23fc4a48 Mon Sep 17 00:00:00 2001 From: gcerri Date: Fri, 9 Aug 2024 12:28:45 +0200 Subject: [PATCH 4/8] Added return_dict of DataFrame.info into the doc --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 32c98fbf9d655..f960a9f03476b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -48,6 +48,7 @@ Other enhancements - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of 
keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) +- :meth:`DataFrame.info` now has a ``return_dict`` parameter (:issue:`59387`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) From 35a5842c2f838c899fbf473bbd0a925dc745ae75 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 Aug 2024 12:25:57 +0000 Subject: [PATCH 5/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/io/formats/info.py | 20 ++++++++-------- pandas/tests/frame/methods/test_info.py | 31 ++++++++++++------------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index c7642f0964ab6..cf6134f77e7b6 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -494,28 +494,28 @@ def non_null_counts(self) -> Series: def memory_usage_bytes(self) -> int: deep = self.memory_usage == "deep" return self.data.memory_usage(index=True, deep=deep).sum() - + def to_dict(self) -> dict: """Return DataFrame info as a dictionary.""" return { - 'Column_summary': self._get_column_summary(), - 'Memory_usage': self.memory_usage_bytes, - 'Index_type': type(self.data.index).__name__, - 'Index_entries': len(self.data.index), + "Column_summary": self._get_column_summary(), + "Memory_usage": self.memory_usage_bytes, + "Index_type": type(self.data.index).__name__, + "Index_entries": len(self.data.index), } def _get_column_summary(self) -> list[dict]: """Return a DataFrame summarizing columns.""" return [ { - '#': i, - 'Column': col, - 'Non-Null-Count': self.data[col].notna().sum(), -
'Dtype': self.data[col].dtype + "#": i, + "Column": col, + "Non-Null-Count": self.data[col].notna().sum(), + "Dtype": self.data[col].dtype, } for i, col in enumerate(self.ids) ] - + def render( self, *, diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 4771f435cc8fa..bd7aa0e8f1f06 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -570,27 +570,26 @@ def test_info_show_counts(row, columns, show_counts, result): df.info(buf=buf, show_counts=show_counts) assert ("non-null" in buf.getvalue()) is result + @pytest.mark.parametrize( - "df", [ - DataFrame({ - 'A': [1, 2, 3], - 'B': [4, 5, 6] - }), + "df", + [ + DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), DataFrame({}), - ] + ], ) def test_info_return_dict(df): result = df.info(return_dict=True) - expected_keys = {'Column_summary', 'Memory_usage', 'Index_type', 'Index_entries'} + expected_keys = {"Column_summary", "Memory_usage", "Index_type", "Index_entries"} assert isinstance(result, dict) assert expected_keys.issubset(result.keys()) - assert 'Column_summary' in result - assert 'Memory_usage' in result - assert 'Index_type' in result - assert 'Index_entries' in result - - assert isinstance(result['Column_summary'], list) - assert isinstance(result['Memory_usage'], np.int64) - assert isinstance(result['Index_type'], str) - assert isinstance(result['Index_entries'], int) \ No newline at end of file + assert "Column_summary" in result + assert "Memory_usage" in result + assert "Index_type" in result + assert "Index_entries" in result + + assert isinstance(result["Column_summary"], list) + assert isinstance(result["Memory_usage"], np.int64) + assert isinstance(result["Index_type"], str) + assert isinstance(result["Index_entries"], int) From 992d57963a0f9eed43fe3787cb0f94d78284d583 Mon Sep 17 00:00:00 2001 From: gcerri Date: Fri, 9 Aug 2024 15:07:10 +0200 Subject: [PATCH 6/8] Adding doc and fix typing --- 
pandas/io/formats/info.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index cf6134f77e7b6..092fb26f6ee48 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -54,6 +54,15 @@ ) +return_dict_sub = dedent( + """\ + return_dict : bool, optional + Whether to return the summary as a dictionary. If True, the method + returns a dictionary containing information about the DataFrame. + If False, the summary is printed and None is returned.""" +) + + frame_examples_sub = dedent( """\ >>> int_values = [1, 2, 3, 4, 5] @@ -136,7 +145,11 @@ 1 column_2 1000000 non-null object 2 column_3 1000000 non-null object dtypes: object(3) - memory usage: 165.9 MB""" + memory usage: 165.9 MB + + >>> info_dict = df.info(return_dict=True) + >>> print(info_dict) + {'Column_summary': '...', 'Memory_usage': 24000128, 'Index_type': 'RangeIndex', 'Index_entries': 1000000}""" ) @@ -153,6 +166,7 @@ "type_sub": " and columns", "max_cols_sub": frame_max_cols_sub, "show_counts_sub": show_counts_sub, + "return_dict_sub": return_dict_sub, "examples_sub": frame_examples_sub, "see_also_sub": frame_see_also_sub, "version_added_sub": "", @@ -233,6 +247,7 @@ "type_sub": "", "max_cols_sub": "", "show_counts_sub": show_counts_sub, + "return_dict_sub": return_dict_sub, "examples_sub": series_examples_sub, "see_also_sub": series_see_also_sub, "version_added_sub": "\n.. versionadded:: 1.4.0\n", @@ -273,11 +288,13 @@ :ref:`Frequently Asked Questions ` for more details. {show_counts_sub} - + {return_dict_sub} + Returns ------- - None - This method prints a summary of a {klass} and returns None. + dict or None + If return_dict is True, returns a dictionary summarizing the {klass}. + Otherwise, returns None. 
See Also -------- @@ -435,7 +452,7 @@ def render( max_cols: int | None, verbose: bool | None, show_counts: bool | None, - ) -> None: + ) -> None | dict: pass @@ -524,7 +541,7 @@ def render( verbose: bool | None, show_counts: bool | None, return_dict: bool | None, - ) -> None: + ) -> None | dict: if return_dict: return self.to_dict() else: From f388827d5a9d04f3b0746f8f91b9bc8027ef42bb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 Aug 2024 13:12:46 +0000 Subject: [PATCH 7/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/io/formats/info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 092fb26f6ee48..8c8cc145194ef 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -146,7 +146,7 @@ 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 165.9 MB - + >>> info_dict = df.info(return_dict=True) >>> print(info_dict) {'Column_summary': '...', 'Memory_usage': 24000128, 'Index_type': 'RangeIndex', 'Index_entries': 1000000}""" @@ -289,7 +289,7 @@ details. 
{show_counts_sub} {return_dict_sub} - + Returns ------- dict or None From 765d1da507a6507ea30086f1d7ea0b9ed1492b85 Mon Sep 17 00:00:00 2001 From: gcerri Date: Fri, 9 Aug 2024 15:23:30 +0200 Subject: [PATCH 8/8] Fix line length issue in doctest example --- pandas/io/formats/info.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 8c8cc145194ef..8629eb0ff6368 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -149,7 +149,8 @@ >>> info_dict = df.info(return_dict=True) >>> print(info_dict) - {'Column_summary': '...', 'Memory_usage': 24000128, 'Index_type': 'RangeIndex', 'Index_entries': 1000000}""" + {'Column_summary': '...', 'Memory_usage': 24000128, + 'Index_type': 'RangeIndex', 'Index_entries': 1000000}""" )