diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index a9237c239701b..f904781178656 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -32,7 +32,7 @@ check each issue individually, and it's not possible to find the unassigned ones For this reason, we implemented a workaround consisting of adding a comment with the exact text `take`. When you do it, a GitHub action will automatically assign you the issue -(this will take seconds, and may require refreshint the page to see it). +(this will take seconds, and may require refreshing the page to see it). By doing this, it's possible to filter the list of issues and find only the unassigned ones. So, a good way to find an issue to start contributing to pandas is to check the list of diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 920919755dc23..327a099259b75 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -54,7 +54,7 @@ Other API changes - :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) -- +- :meth:`GroupBy.groups` now returns an abbreviated representation when called on large DataFrames (:issue:`1135`) Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f3a0cf3841b5b..68c5809479b70 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -607,7 +607,7 @@ def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool: Check if full repr fits in horizontal boundaries imposed by the display options width and max_columns. - In case off non-interactive session, no boundaries apply. + In case of non-interactive session, no boundaries apply. 
`ignore_width` is here so ipnb+HTML output can behave the way users expect. display.max_columns remains in effect. diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index aa21aa452be95..07967e5dad9ec 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -72,7 +72,7 @@ class providing the base-class of operations. _apply_docs = dict( template=""" - Apply function `func` group-wise and combine the results together. + Apply function `func` group-wise and combine the results together. The function passed to `apply` must take a {input} as its first argument and return a DataFrame, Series or scalar. `apply` will diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 679d3668523c2..5e13779608dd6 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -350,7 +350,7 @@ def get_group_levels(self): def _is_builtin_func(self, arg): """ - if we define an builtin function for this argument, return it, + if we define a builtin function for this argument, return it, otherwise return the arg """ return SelectionMixin._builtin_table.get(arg, arg) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 10d9552e6f5a7..cf9e84cab6a84 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import Any, Dict, FrozenSet, Hashable, Optional, Union +from typing import Any, FrozenSet, Hashable, Optional, Union import warnings import numpy as np @@ -75,6 +75,7 @@ from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( + PrettyDict, default_pprint, format_object_attrs, format_object_summary, @@ -4765,7 +4766,7 @@ def _maybe_promote(self, other): return self.astype("object"), other.astype("object") return self, other - def groupby(self, values) -> Dict[Hashable, np.ndarray]: + def groupby(self, values) -> PrettyDict[Hashable, 
np.ndarray]: """ Group the index labels by a given array of values. @@ -4790,7 +4791,7 @@ def groupby(self, values) -> Dict[Hashable, np.ndarray]: # map to the label result = {k: self.take(v) for k, v in result.items()} - return result + return PrettyDict(result) def map(self, mapper, na_action=None): """ diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 4b5b5e9a0ce15..f8f02f4d4902a 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -6,12 +6,14 @@ from typing import ( Any, Callable, + Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, + TypeVar, Union, ) @@ -20,6 +22,8 @@ from pandas.core.dtypes.inference import is_sequence EscapeChars = Union[Mapping[str, str], Iterable[str]] +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") def adjoin(space: int, *lists: List[str], **kwargs) -> str: @@ -528,3 +532,10 @@ def format_object_attrs( if len(obj) > max_seq_items: attrs.append(("length", len(obj))) return attrs + + +class PrettyDict(Dict[_KT, _VT]): + """Dict extension to support abbreviated __repr__""" + + def __repr__(self) -> str: + return pprint_thing(self) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index eb9552fbbebc1..b7d7124a3a5e5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2037,3 +2037,23 @@ def test_groupby_list_level(): expected = pd.DataFrame(np.arange(0, 9).reshape(3, 3)) result = expected.groupby(level=[0]).mean() tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "max_seq_items, expected", + [ + (5, "{0: [0], 1: [1], 2: [2], 3: [3], 4: [4]}"), + (4, "{0: [0], 1: [1], 2: [2], 3: [3], ...}"), + ], +) +def test_groups_repr_truncates(max_seq_items, expected): + # GH 1135 + df = pd.DataFrame(np.random.randn(5, 1)) + df["a"] = df.index + + with pd.option_context("display.max_seq_items", max_seq_items): + result = df.groupby("a").groups.__repr__() + assert result == expected + + 
result = df.groupby(np.array(df.a)).groups.__repr__() + assert result == expected