diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 71903d10a6983..5becbf0a87472 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -217,6 +217,7 @@ Other enhancements - Added "Juneteenth National Independence Day" to ``USFederalHolidayCalendar``. See also `Other API changes`_. - :meth:`.Rolling.var`, :meth:`.Expanding.var`, :meth:`.Rolling.std`, :meth:`.Expanding.std` now support `Numba `_ execution with the ``engine`` keyword (:issue:`44461`) +- :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8c85c4e961d99..8ebdb3b890d77 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -206,6 +206,7 @@ format as fmt, ) from pandas.io.formats.info import ( + INFO_DOCSTRING, DataFrameInfo, frame_sub_kwargs, ) @@ -3138,7 +3139,7 @@ def to_xml( return xml_formatter.write_output() # ---------------------------------------------------------------------- - @doc(DataFrameInfo.render, **frame_sub_kwargs) + @doc(INFO_DOCSTRING, **frame_sub_kwargs) def info( self, verbose: bool | None = None, diff --git a/pandas/core/series.py b/pandas/core/series.py index ffa31b4f66211..b3133ee1275a1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -139,6 +139,11 @@ from pandas.core.tools.datetimes import to_datetime import pandas.io.formats.format as fmt +from pandas.io.formats.info import ( + INFO_DOCSTRING, + SeriesInfo, + series_sub_kwargs, +) import pandas.plotting if TYPE_CHECKING: @@ -4914,6 +4919,22 @@ def replace( method=method, ) + @doc(INFO_DOCSTRING, **series_sub_kwargs) + def info( + self, + verbose: bool | None = None, + buf: IO[str] | None = None, + max_cols: int | None = None, + memory_usage: bool | str | None = None, + show_counts: bool = True, + ) -> None: + return SeriesInfo(self, memory_usage).render( + buf=buf, + max_cols=max_cols, + verbose=verbose, + show_counts=show_counts, + ) + def _replace_single(self, to_replace, method: str, inplace: bool, limit): """ Replaces values in a Series using the fill method specified when no diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 9340d020cd6ce..4a9310c6dccf8 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -20,7 +20,6 @@ Dtype, WriteBuffer, ) -from pandas.util._decorators import doc from pandas.core.indexes.api import Index @@ -51,7 +50,11 @@ only if the DataFrame is smaller than ``pandas.options.display.max_info_rows`` and ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. + shows the counts, and False never shows the counts.""" +) + +null_counts_sub = dedent( + """ null_counts : bool, optional .. deprecated:: 1.2.0 Use show_counts instead.""" @@ -157,12 +160,94 @@ "type_sub": " and columns", "max_cols_sub": frame_max_cols_sub, "show_counts_sub": show_counts_sub, + "null_counts_sub": null_counts_sub, "examples_sub": frame_examples_sub, "see_also_sub": frame_see_also_sub, "version_added_sub": "", } +series_examples_sub = dedent( + """\ + >>> int_values = [1, 2, 3, 4, 5] + >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] + >>> s = pd.Series(text_values, index=int_values) + >>> s.info() + + Int64Index: 5 entries, 1 to 5 + Series name: None + Non-Null Count Dtype + -------------- ----- + 5 non-null object + dtypes: object(1) + memory usage: 80.0+ bytes + + Prints a summary excluding information about its values: + + >>> s.info(verbose=False) + + Int64Index: 5 entries, 1 to 5 + dtypes: object(1) + memory usage: 80.0+ bytes + + Pipe output of Series.info to buffer instead of sys.stdout, get + buffer content and writes to a text file: + + >>> import io + >>> buffer = io.StringIO() + >>> s.info(buf=buffer) + >>> s = buffer.getvalue() + >>> with open("df_info.txt", "w", + ... encoding="utf-8") as f: # doctest: +SKIP + ... f.write(s) + 260 + + The `memory_usage` parameter allows deep introspection mode, specially + useful for big Series and fine-tune memory optimization: + + >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6)) + >>> s.info() + + RangeIndex: 1000000 entries, 0 to 999999 + Series name: None + Non-Null Count Dtype + -------------- ----- + 1000000 non-null object + dtypes: object(1) + memory usage: 7.6+ MB + + >>> s.info(memory_usage='deep') + + RangeIndex: 1000000 entries, 0 to 999999 + Series name: None + Non-Null Count Dtype + -------------- ----- + 1000000 non-null object + dtypes: object(1) + memory usage: 55.3 MB""" +) + + +series_see_also_sub = dedent( + """\ + Series.describe: Generate descriptive statistics of Series. + Series.memory_usage: Memory usage of Series.""" +) + + +series_sub_kwargs = { + "klass": "Series", + "type_sub": "", + "max_cols_sub": "", + "show_counts_sub": show_counts_sub, + "null_counts_sub": "", + "examples_sub": series_examples_sub, + "see_also_sub": series_see_also_sub, + "version_added_sub": "\n.. versionadded:: 1.4.0\n", +} + + INFO_DOCSTRING = dedent( """ Print a concise summary of a {klass}. @@ -181,7 +266,7 @@ buf : writable buffer, defaults to sys.stdout Where to send the output. By default, the output is printed to sys.stdout. Pass a writable buffer if you need to further process - the output. + the output.\ {max_cols_sub} memory_usage : bool, str, optional Specifies whether total memory usage of the {klass} @@ -196,7 +281,7 @@ consume the same memory amount for corresponding dtypes. With deep memory introspection, a real memory usage calculation is performed at the cost of computational resources. - {show_counts_sub} + {show_counts_sub}{null_counts_sub} Returns ------- @@ -422,16 +507,6 @@ def memory_usage_bytes(self) -> int: deep = False return self.data.memory_usage(index=True, deep=deep).sum() - @doc( - INFO_DOCSTRING, - klass="DataFrame", - type_sub=" and columns", - max_cols_sub=frame_max_cols_sub, - show_counts_sub=show_counts_sub, - examples_sub=frame_examples_sub, - see_also_sub=frame_see_also_sub, - version_added_sub="", - ) def render( self, *, @@ -449,6 +524,69 @@ def render( printer.to_buffer(buf) +class SeriesInfo(BaseInfo): + """ + Class storing series-specific info. + """ + + def __init__( + self, + data: Series, + memory_usage: bool | str | None = None, + ): + self.data: Series = data + self.memory_usage = _initialize_memory_usage(memory_usage) + + def render( + self, + *, + buf: WriteBuffer[str] | None = None, + max_cols: int | None = None, + verbose: bool | None = None, + show_counts: bool | None = None, + ) -> None: + if max_cols is not None: + raise ValueError( + "Argument `max_cols` can only be passed " + "in DataFrame.info, not Series.info" + ) + printer = SeriesInfoPrinter( + info=self, + verbose=verbose, + show_counts=show_counts, + ) + printer.to_buffer(buf) + + @property + def non_null_counts(self) -> Sequence[int]: + return [self.data.count()] + + @property + def dtypes(self) -> Iterable[Dtype]: + return [self.data.dtypes] + + @property + def dtype_counts(self): + from pandas.core.frame import DataFrame + + return _get_dataframe_dtype_counts(DataFrame(self.data)) + + @property + def memory_usage_bytes(self) -> int: + """Memory usage in bytes. + + Returns + ------- + memory_usage_bytes : int + Object's total memory usage in bytes. + """ + if self.memory_usage == "deep": + deep = True + else: + deep = False + return self.data.memory_usage(index=True, deep=deep) + + class InfoPrinterAbstract: """ Class for printing dataframe or series info. @@ -548,6 +686,49 @@ def _create_table_builder(self) -> DataFrameTableBuilder: ) +class SeriesInfoPrinter(InfoPrinterAbstract): + """Class for printing series info. + + Parameters + ---------- + info : SeriesInfo + Instance of SeriesInfo. + verbose : bool, optional + Whether to print the full summary. + show_counts : bool, optional + Whether to show the non-null counts. + """ + + def __init__( + self, + info: SeriesInfo, + verbose: bool | None = None, + show_counts: bool | None = None, + ): + self.info = info + self.data = info.data + self.verbose = verbose + self.show_counts = self._initialize_show_counts(show_counts) + + def _create_table_builder(self) -> SeriesTableBuilder: + """ + Create instance of table builder based on verbosity. + """ + if self.verbose or self.verbose is None: + return SeriesTableBuilderVerbose( + info=self.info, + with_counts=self.show_counts, + ) + else: + return SeriesTableBuilderNonVerbose(info=self.info) + + def _initialize_show_counts(self, show_counts: bool | None) -> bool: + if show_counts is None: + return True + else: + return show_counts + + class TableBuilderAbstract(ABC): """ Abstract builder for info table. @@ -832,6 +1013,102 @@ def _gen_columns(self) -> Iterator[str]: yield pprint_thing(col) +class SeriesTableBuilder(TableBuilderAbstract): + """ + Abstract builder for series info table. + + Parameters + ---------- + info : SeriesInfo. + Instance of SeriesInfo. + """ + + def __init__(self, *, info: SeriesInfo): + self.info: SeriesInfo = info + + def get_lines(self) -> list[str]: + self._lines = [] + self._fill_non_empty_info() + return self._lines + + @property + def data(self) -> Series: + """Series.""" + return self.info.data + + def add_memory_usage_line(self) -> None: + """Add line containing memory usage.""" + self._lines.append(f"memory usage: {self.memory_usage_string}") + + @abstractmethod + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty series.""" + + +class SeriesTableBuilderNonVerbose(SeriesTableBuilder): + """ + Series info table builder for non-verbose output. + """ + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty series.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + +class SeriesTableBuilderVerbose(SeriesTableBuilder, TableBuilderVerboseMixin): + """ + Series info table builder for verbose output. + """ + + def __init__( + self, + *, + info: SeriesInfo, + with_counts: bool, + ): + self.info = info + self.with_counts = with_counts + self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) + self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty series.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_series_name_line() + self.add_header_line() + self.add_separator_line() + self.add_body_lines() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + def add_series_name_line(self): + self._lines.append(f"Series name: {self.data.name}") + + @property + def headers(self) -> Sequence[str]: + """Headers names of the columns in verbose table.""" + if self.with_counts: + return ["Non-Null Count", "Dtype"] + return ["Dtype"] + + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data without counts.""" + yield from self._gen_dtypes() + + def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data with counts.""" + yield from zip( + self._gen_non_null_counts(), + self._gen_dtypes(), + ) + + def _get_dataframe_dtype_counts(df: DataFrame) -> Mapping[str, int]: """ Create mapping between datatypes and their number of occurrences. diff --git a/pandas/tests/io/formats/test_series_info.py b/pandas/tests/io/formats/test_series_info.py new file mode 100644 index 0000000000000..1f755d675d078 --- /dev/null +++ b/pandas/tests/io/formats/test_series_info.py @@ -0,0 +1,183 @@ +from io import StringIO +from string import ascii_uppercase as uppercase +import textwrap + +import numpy as np +import pytest + +from pandas.compat import PYPY + +from pandas import ( + CategoricalIndex, + MultiIndex, + Series, + date_range, +) + + +def test_info_categorical_column_just_works(): + n = 2500 + data = np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) + s = Series(data).astype("category") + s.isna() + buf = StringIO() + s.info(buf=buf) + + s2 = s[s == "d"] + buf = StringIO() + s2.info(buf=buf) + + +def test_info_categorical(): + # GH14298 + idx = CategoricalIndex(["a", "b"]) + s = Series(np.zeros(2), index=idx) + buf = StringIO() + s.info(buf=buf) + + +@pytest.mark.parametrize("verbose", [True, False]) +def test_info_series(verbose): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(range(len(index)), index=index, name="sth") + buf = StringIO() + s.info(verbose=verbose, buf=buf) + result = buf.getvalue() + + expected = textwrap.dedent( + """\ + + MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') + """ + ) + if verbose: + expected += textwrap.dedent( + """\ + Series name: sth + Non-Null Count Dtype + -------------- ----- + 10 non-null int64 + """ + ) + expected += textwrap.dedent( + f"""\ + dtypes: int64(1) + memory usage: {s.memory_usage()}.0+ bytes + """ + ) + assert result == expected + + +def test_info_memory(): + s = Series([1, 2], dtype="i8") + buf = StringIO() + s.info(buf=buf) + result = buf.getvalue() + memory_bytes = float(s.memory_usage()) + expected = textwrap.dedent( + f"""\ + + RangeIndex: 2 entries, 0 to 1 + Series name: None + Non-Null Count Dtype + -------------- ----- + 2 non-null int64 + dtypes: int64(1) + memory usage: {memory_bytes} bytes + """ + ) + assert result == expected + + +def test_info_wide(): + s = Series(np.random.randn(101)) + msg = "Argument `max_cols` can only be passed in DataFrame.info, not Series.info" + with pytest.raises(ValueError, match=msg): + s.info(max_cols=1) + + +def test_info_shows_dtypes(): + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + n = 10 + for dtype in dtypes: + s = Series(np.random.randint(2, size=n).astype(dtype)) + buf = StringIO() + s.info(buf=buf) + res = buf.getvalue() + name = f"{n:d} non-null {dtype}" + assert name in res + + +@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") +def test_info_memory_usage_deep_not_pypy(): + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) > s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) > s_object.memory_usage() + + +@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") +def test_info_memory_usage_deep_pypy(): + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) == s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) == s_object.memory_usage() + + +@pytest.mark.parametrize( + "series, plus", + [ + (Series(1, index=[1, 2, 3]), False), + (Series(1, index=list("ABC")), True), + (Series(1, index=MultiIndex.from_product([range(3), range(3)])), False), + ( + Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])), + True, + ), + ], +) +def test_info_memory_usage_qualified(series, plus): + buf = StringIO() + series.info(buf=buf) + if plus: + assert "+" in buf.getvalue() + else: + assert "+" not in buf.getvalue() + + +def test_info_memory_usage_bug_on_multiindex(): + # GH 14308 + # memory usage introspection should not materialize .values + N = 100 + M = len(uppercase) + index = MultiIndex.from_product( + [list(uppercase), date_range("20160101", periods=N)], + names=["id", "date"], + ) + s = Series(np.random.randn(N * M), index=index) + + unstacked = s.unstack("id") + assert s.values.nbytes == unstacked.values.nbytes + assert s.memory_usage(deep=True) > unstacked.memory_usage(deep=True).sum() + + # high upper bound + diff = unstacked.memory_usage(deep=True).sum() - s.memory_usage(deep=True) + assert diff < 2000 diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 4e4eb89328540..a01b8d304d05d 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -110,12 +110,6 @@ def test_not_hashable(self): def test_contains(self, datetime_series): tm.assert_contains_all(datetime_series.index, datetime_series) - def test_raise_on_info(self): - s = Series(np.random.randn(10)) - msg = "'Series' object has no attribute 'info'" - with pytest.raises(AttributeError, match=msg): - s.info() - def test_axis_alias(self): s = Series([1, 2, np.nan]) tm.assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index"))