@Substitution(
    klass="Series",
    type_sub="",
    max_cols_sub="",
    examples_sub=(
        """
    >>> int_values = [1, 2, 3, 4, 5]
    >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
    >>> s = pd.Series(text_values, index=int_values)
    >>> s.info()
    <class 'pandas.core.series.Series'>
    Int64Index: 5 entries, 1 to 5
    Series name: None
    Non-Null Count  Dtype
    --------------  -----
    5 non-null      object
    dtypes: object(1)
    memory usage: 80.0+ bytes

    Prints a summary excluding information about its values:

    >>> s.info(verbose=False)
    <class 'pandas.core.series.Series'>
    Int64Index: 5 entries, 1 to 5
    dtypes: object(1)
    memory usage: 80.0+ bytes

    Pipe output of Series.info to a buffer instead of sys.stdout, get the
    buffer content and write it to a text file:

    >>> import io
    >>> buffer = io.StringIO()
    >>> s.info(buf=buffer)
    >>> info_text = buffer.getvalue()
    >>> with open("series_info.txt", "w",
    ...           encoding="utf-8") as f:  # doctest: +SKIP
    ...     f.write(info_text)
    260

    The `memory_usage` parameter allows deep introspection mode, especially
    useful for big Series and fine-tune memory optimization:

    >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
    >>> s = pd.Series(random_strings_array)
    >>> s.info()
    <class 'pandas.core.series.Series'>
    RangeIndex: 1000000 entries, 0 to 999999
    Series name: None
    Non-Null Count    Dtype
    --------------    -----
    1000000 non-null  object
    dtypes: object(1)
    memory usage: 7.6+ MB

    >>> s.info(memory_usage='deep')
    <class 'pandas.core.series.Series'>
    RangeIndex: 1000000 entries, 0 to 999999
    Series name: None
    Non-Null Count    Dtype
    --------------    -----
    1000000 non-null  object
    dtypes: object(1)
    memory usage: 55.3 MB"""
    ),
    see_also_sub=(
        """
    Series.describe: Generate descriptive statistics of Series.
    Series.memory_usage: Memory usage of Series."""
    ),
)
@doc(SeriesInfo.info)
def info(
    self,
    verbose: Optional[bool] = None,
    buf: Optional[IO[str]] = None,
    max_cols: Optional[int] = None,
    memory_usage: Optional[Union[bool, str]] = None,
    null_counts: Optional[bool] = None,
) -> None:
    # NOTE: the user-facing docstring is assembled by the ``doc`` and
    # ``Substitution`` decorators above, so this method deliberately carries
    # no docstring of its own; explanatory text lives in comments instead.
    #
    # ``max_cols`` is accepted only so the signature lines up with
    # ``DataFrame.info``; a Series has a single column, so any explicit
    # value is rejected rather than silently ignored.
    if max_cols is not None:
        raise ValueError(
            "Argument `max_cols` can only be passed "
            "in DataFrame.info, not Series.info"
        )
    # ``max_cols`` is necessarily ``None`` at this point; it is still
    # forwarded positionally to keep the argument order SeriesInfo expects.
    return SeriesInfo(
        self, verbose, buf, max_cols, memory_usage, null_counts
    ).info()
def _get_count_configs(
    counts: "Series", col_space: int, show_counts: bool, col_count: Optional[int] = None
) -> "CountConfigs":
    """
    Get configs for displaying counts, depending on the value of `show_counts`.

    Parameters
    ----------
    counts : Series
        Non-null count of Series (or of each column of DataFrame).
    col_space : int
        How many spaces to leave between non-null count and dtype columns.
    show_counts : bool
        Whether to display non-null counts.
    col_count : int, optional
        Number of columns in DataFrame.

    Returns
    -------
    CountConfigs

    Raises
    ------
    AssertionError
        If `show_counts` is True, `col_count` is given and it differs from
        ``len(counts)``.
    """
    if show_counts:
        if col_count is not None and col_count != len(counts):  # pragma: no cover
            raise AssertionError(
                f"Columns must equal counts ({col_count} != {len(counts)})"
            )
        count_header = "Non-Null Count"
        len_count = len(count_header)
        non_null = " non-null"
        # ``default=0`` keeps an empty ``counts`` from raising in ``max``;
        # the header width then decides the column width.
        max_count = max((len(pprint_thing(k)) for k in counts), default=0) + len(
            non_null
        )
        space_count = max(len_count, max_count) + col_space
        count_temp = "{count}" + non_null
    else:
        # No count column: empty header, zero width, and a template that
        # renders just the (empty) count placeholder.
        count_header = ""
        space_count = len(count_header)
        len_count = space_count
        count_temp = "{count}"
    return CountConfigs(counts, count_header, space_count, len_count, count_temp)


def _display_counts_and_dtypes(
    lines: List[str],
    ids: "Index",
    dtypes: "Series",
    show_counts: bool,
    count_configs: "CountConfigs",
    space_dtype: int,
    space: int = 0,
    space_num: int = 0,
) -> None:
    """
    Append count and dtype of Series (or of each column of Frame) to `lines`.

    Parameters
    ----------
    lines : List[str]
        At this stage, this contains the main header and the info table headers.
        It is modified in place.
    ids : Index
        Series name (or names of DataFrame columns).
    dtypes : Series
        Series dtype (or dtypes of DataFrame columns).
    show_counts : bool
        Whether to show non-null counts.
    count_configs: CountConfigs
        Configs with which to display counts.
    space_dtype : int
        Number of spaces that `dtypes` column should occupy.
    space : int = 0
        Number of spaces that `Column` header should occupy
        (including space before `non-null count` column).
    space_num : int = 0
        Number of spaces that ` # ` header should occupy (including space
        before `Column` column), only applicable for `DataFrame.info`.
    """
    counts = count_configs.counts
    count_temp = count_configs.count_temp
    space_count = count_configs.space_count

    for i, col in enumerate(ids):
        # ``i`` is a position, not a label: use ``.iloc`` so that integer
        # column labels (e.g. columns ``[5, 6]``) are not mistaken for
        # positions by label-based ``[]`` lookup.
        dtype = dtypes.iloc[i]
        col = pprint_thing(col)

        line_no = _put_str(f" {i}", space_num)
        count = counts.iloc[i] if show_counts else ""

        lines.append(
            line_no
            + _put_str(col, space)
            + _put_str(count_temp.format(count=count), space_count)
            + _put_str(dtype, space_dtype)
        )


def _get_header_and_spaces(
    dtypes: "Series", space_count: int, count_header: str, header: str = ""
) -> Tuple[int, str, int]:
    """
    Append extra columns (count and type) to header, if applicable.

    Parameters
    ----------
    dtypes : Series
        Series dtype (or dtypes of DataFrame columns).
    space_count : int
        Number of spaces that count_header should occupy
        (including space before `dtypes` column).
    count_header : str
        Header that will be printed out above non-null counts in output.
    header : str
        Current header.

    Returns
    -------
    space_dtype : int
        Number of spaces that `dtypes` column should occupy.
    header : str
        Header with extra columns (count and type) appended.
    len_dtype : int
        Length of dtype header.
    """
    dtype_header = "Dtype"
    len_dtype = len(dtype_header)
    # ``default=0`` lets an empty ``dtypes`` fall back to the header width.
    max_dtypes = max((len(pprint_thing(k)) for k in dtypes), default=0)
    space_dtype = max(len_dtype, max_dtypes)
    header += _put_str(count_header, space_count) + _put_str(dtype_header, space_dtype)
    return space_dtype, header, len_dtype
class SeriesInfo(BaseInfo):
    """
    Build the ``info()`` summary lines for a Series.
    """

    def _get_mem_usage(self, deep: bool) -> int:
        """
        Return the memory usage of the Series, index included.

        Parameters
        ----------
        deep : bool
            Forwarded as ``deep`` to ``memory_usage``.
        """
        series = self.data
        return series.memory_usage(index=True, deep=deep)

    def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]:
        """
        Return the Series name wrapped in an Index, and its dtype wrapped
        in an object built with the Series' own constructor.
        """
        series = self.data
        name_index = Index([series.name])
        dtype_holder = cast("Series", series._constructor(series.dtypes))
        return name_index, dtype_holder

    def _verbose_repr(
        self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool
    ) -> None:
        """
        Append the verbose part of the summary to `lines`: the Series name,
        the table header, a dashed rule, and the count/dtype row.
        """
        series = self.data
        lines.append(f"Series name: {series.name}")

        # Gap left between the non-null count column and the dtype column.
        gap = 2

        non_null = cast("Series", series._constructor(series.count()))
        cfg = _get_count_configs(non_null, gap, show_counts)

        space_dtype, header, len_dtype = _get_header_and_spaces(
            dtypes, cfg.space_count, cfg.count_header
        )

        count_rule = _put_str("-" * cfg.len_count, cfg.space_count)
        dtype_rule = _put_str("-" * len_dtype, space_dtype)
        lines.extend([header, count_rule + dtype_rule])

        _display_counts_and_dtypes(
            lines, ids, dtypes, show_counts, cfg, space_dtype
        )

    def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None:
        """
        Add nothing to `lines`: the non-verbose Series summary has no
        extra section.
        """
datetime_frame): + s2 = s[s == "d"] + buf = StringIO() + s2.info(buf=buf) + + +def test_info_frame(float_frame, datetime_frame): io = StringIO() float_frame.info(buf=io) datetime_frame.info(buf=io) @@ -79,6 +90,32 @@ def test_info(float_frame, datetime_frame): frame.info(verbose=False) +@pytest.mark.parametrize("verbose", [True, False]) +def test_info_series(verbose): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(range(len(index)), index=index, name="sth") + buf = StringIO() + s.info(verbose=verbose, buf=buf) + expected = """ +MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') +""" + if verbose: + expected += """Series name: sth +Non-Null Count Dtype +-------------- ----- +10 non-null int64 +""" + expected += f"""dtypes: int64(1) +memory usage: {s.memory_usage()}.0+ bytes +""" + result = buf.getvalue() + assert result == expected + + def test_info_verbose(): buf = StringIO() size = 1001 @@ -122,6 +159,25 @@ def test_info_memory(): ) assert result == expected + s = Series([1, 2], dtype="i8") + buf = StringIO() + s.info(buf=buf) + result = buf.getvalue() + bytes = float(s.memory_usage()) + expected = textwrap.dedent( + f"""\ + + RangeIndex: 2 entries, 0 to 1 + Series name: None + Non-Null Count Dtype + -------------- ----- + 2 non-null int64 + dtypes: int64(1) + memory usage: {bytes} bytes + """ + ) + assert result == expected + def test_info_wide(): io = StringIO() @@ -140,6 +196,11 @@ def test_info_wide(): assert rs == xp reset_option("display.max_info_columns") + s = Series(np.random.randn(101)) + msg = "Argument `max_cols` can only be passed in DataFrame.info, not Series.info" + with pytest.raises(ValueError, match=msg): + s.info(max_cols=1) + def test_info_duplicate_columns(): io = StringIO() @@ -188,6 +249,14 @@ def test_info_shows_column_dtypes(): name = f" {i:d} {i:d} {n:d} non-null {dtype}" 
assert name in res + for dtype in dtypes: + s = Series(np.random.randint(2, size=n).astype(dtype)) + buf = StringIO() + s.info(buf=buf) + res = buf.getvalue() + name = f"{n:d} non-null {dtype}" + assert name in res + def test_info_max_cols(): df = DataFrame(np.random.randn(10, 5)) @@ -320,6 +389,14 @@ def test_info_memory_usage_deep_not_pypy(): df_object = DataFrame({"a": ["a"]}) assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) > s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) > s_object.memory_usage() + @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") def test_info_memory_usage_deep_pypy(): @@ -332,6 +409,14 @@ def test_info_memory_usage_deep_pypy(): df_object = DataFrame({"a": ["a"]}) assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) == s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) == s_object.memory_usage() + @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") def test_usage_via_getsizeof(): @@ -371,6 +456,26 @@ def test_info_memory_usage_qualified(): df.info(buf=buf) assert "+" in buf.getvalue() + buf = StringIO() + s = Series(1, index=[1, 2, 3]) + s.info(buf=buf) + assert "+" not in buf.getvalue() + + buf = StringIO() + s = Series(1, index=list("ABC")) + s.info(buf=buf) + assert "+" in buf.getvalue() + + buf = StringIO() + s = Series(1, index=MultiIndex.from_product([range(3), range(3)]),) + s.info(buf=buf) + assert "+" not in buf.getvalue() + + buf = StringIO() + s = Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]]),) + 
s.info(buf=buf) + assert "+" in buf.getvalue() + def test_info_memory_usage_bug_on_multiindex(): # GH 14308 @@ -393,6 +498,15 @@ def memory_usage(f): # high upper bound assert memory_usage(unstacked) - memory_usage(df) < 2000 + s = Series(np.random.randn(N * M), index=index) + + unstacked = s.unstack("id") + assert s.values.nbytes == unstacked.values.nbytes + assert s.memory_usage(deep=True) > unstacked.memory_usage(deep=True).sum() + + # high upper bound + assert unstacked.memory_usage(deep=True).sum() - s.memory_usage(deep=True) < 2000 + def test_info_categorical(): # GH14298 @@ -401,3 +515,8 @@ def test_info_categorical(): buf = StringIO() df.info(buf=buf) + + s = Series(np.zeros((2)), index=idx) + + buf = StringIO() + s.info(buf=buf) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index a69c0ee75eaba..5cb75757a7df0 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -318,12 +318,6 @@ def test_items_strings(self, string_series): # assert is lazy (generators don't define reverse, lists do) assert not hasattr(string_series.items(), "reverse") - def test_raise_on_info(self): - s = Series(np.random.randn(10)) - msg = "'Series' object has no attribute 'info'" - with pytest.raises(AttributeError, match=msg): - s.info() - def test_copy(self): for deep in [None, False, True]: