
Commit addfd7a

ENH: truncate output of Groupby.groups (#31388)
1 parent 78d8891 commit addfd7a
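
For context, the user-visible effect of the change, sketched from the new test added in pandas/tests/groupby/test_groupby.py below (the truncation threshold is the existing display.max_seq_items option; the printed output in the comment is illustrative):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(5, 1))
    df["a"] = df.index

    # GroupBy.groups maps each group key to its index labels; its repr now goes
    # through pprint_thing, so large mappings are truncated instead of printed in full.
    with pd.option_context("display.max_seq_items", 4):
        print(df.groupby("a").groups)
    # {0: [0], 1: [1], 2: [2], 3: [3], ...}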

8 files changed: +40 -8 lines changed

doc/source/development/contributing.rst

+1 -1

@@ -32,7 +32,7 @@ check each issue individually, and it's not possible to find the unassigned ones
 
 For this reason, we implemented a workaround consisting of adding a comment with the exact
 text `take`. When you do it, a GitHub action will automatically assign you the issue
-(this will take seconds, and may require refreshint the page to see it).
+(this will take seconds, and may require refreshing the page to see it).
 By doing this, it's possible to filter the list of issues and find only the unassigned ones.
 
 So, a good way to find an issue to start contributing to pandas is to check the list of

doc/source/whatsnew/v1.1.0.rst

+1 -1

@@ -54,7 +54,7 @@ Other API changes
 
 - :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
   will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
--
+- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
 
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

pandas/core/frame.py

+1 -1

@@ -607,7 +607,7 @@ def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
         Check if full repr fits in horizontal boundaries imposed by the display
         options width and max_columns.
 
-        In case off non-interactive session, no boundaries apply.
+        In case of non-interactive session, no boundaries apply.
 
         `ignore_width` is here so ipnb+HTML output can behave the way
         users expect. display.max_columns remains in effect.

pandas/core/groupby/groupby.py

+1 -1

@@ -72,7 +72,7 @@ class providing the base-class of operations.
 
 _apply_docs = dict(
     template="""
-    Apply function `func` group-wise and combine the results together.
+    Apply function `func` group-wise and combine the results together.
 
     The function passed to `apply` must take a {input} as its first
     argument and return a DataFrame, Series or scalar. `apply` will

pandas/core/groupby/ops.py

+1 -1

@@ -350,7 +350,7 @@ def get_group_levels(self):
 
     def _is_builtin_func(self, arg):
         """
-        if we define an builtin function for this argument, return it,
+        if we define a builtin function for this argument, return it,
         otherwise return the arg
         """
         return SelectionMixin._builtin_table.get(arg, arg)

pandas/core/indexes/base.py

+4 -3

@@ -1,7 +1,7 @@
 from datetime import datetime
 import operator
 from textwrap import dedent
-from typing import Any, Dict, FrozenSet, Hashable, Optional, Union
+from typing import Any, FrozenSet, Hashable, Optional, Union
 import warnings
 
 import numpy as np
@@ -76,6 +76,7 @@
 from pandas.core.strings import StringMethods
 
 from pandas.io.formats.printing import (
+    PrettyDict,
     default_pprint,
     format_object_attrs,
     format_object_summary,
@@ -4783,7 +4784,7 @@ def _maybe_promote(self, other):
                 return self.astype("object"), other.astype("object")
         return self, other
 
-    def groupby(self, values) -> Dict[Hashable, np.ndarray]:
+    def groupby(self, values) -> PrettyDict[Hashable, np.ndarray]:
         """
         Group the index labels by a given array of values.
 
@@ -4808,7 +4809,7 @@ def groupby(self, values) -> Dict[Hashable, np.ndarray]:
         # map to the label
         result = {k: self.take(v) for k, v in result.items()}
 
-        return result
+        return PrettyDict(result)
 
     def map(self, mapper, na_action=None):
         """

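Apart from the repr, the return value of Index.groupby behaves as before: PrettyDict subclasses dict, so key lookup and iteration are unchanged. A minimal sketch (the index and grouping values are made up for illustration, and the printed forms are approximate):

    import numpy as np
    import pandas as pd

    idx = pd.Index(["a", "b", "c", "d"])
    groups = idx.groupby(np.array([1, 1, 2, 2]))

    # still an ordinary mapping of group value -> labels taken from the index
    assert isinstance(groups, dict)
    print(groups[1])  # Index(['a', 'b'], dtype='object')
    print(groups)     # repr is now routed through pprint_thing and can be abbreviated
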
pandas/io/formats/printing.py

+11
@@ -6,12 +6,14 @@
 from typing import (
     Any,
     Callable,
+    Dict,
     Iterable,
     List,
     Mapping,
     Optional,
     Sequence,
     Tuple,
+    TypeVar,
     Union,
 )
 
@@ -20,6 +22,8 @@
 from pandas.core.dtypes.inference import is_sequence
 
 EscapeChars = Union[Mapping[str, str], Iterable[str]]
+_KT = TypeVar("_KT")
+_VT = TypeVar("_VT")
 
 
 def adjoin(space: int, *lists: List[str], **kwargs) -> str:
@@ -528,3 +532,10 @@ def format_object_attrs(
     if len(obj) > max_seq_items:
         attrs.append(("length", len(obj)))
     return attrs
+
+
+class PrettyDict(Dict[_KT, _VT]):
+    """Dict extension to support abbreviated __repr__"""
+
+    def __repr__(self) -> str:
+        return pprint_thing(self)
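
PrettyDict adds no formatting logic of its own: __repr__ defers to the existing pprint_thing helper, which already honours display.max_seq_items for mappings, and subclassing Dict[_KT, _VT] keeps the class usable in parameterized annotations such as PrettyDict[Hashable, np.ndarray] in pandas/core/indexes/base.py. A small sketch of the resulting behaviour (output shown in comments is illustrative):

    import pandas as pd
    from pandas.io.formats.printing import PrettyDict, pprint_thing

    d = PrettyDict({i: [i] for i in range(10)})

    with pd.option_context("display.max_seq_items", 3):
        # repr(d) is truncated because it goes through pprint_thing
        print(repr(d))                # {0: [0], 1: [1], 2: [2], ...}
        # a built-in dict is not affected; pprint_thing must be called explicitly
        print(pprint_thing(dict(d)))  # same truncated string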

pandas/tests/groupby/test_groupby.py

+20
@@ -2037,3 +2037,23 @@ def test_groupby_list_level():
     expected = pd.DataFrame(np.arange(0, 9).reshape(3, 3))
     result = expected.groupby(level=[0]).mean()
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "max_seq_items, expected",
+    [
+        (5, "{0: [0], 1: [1], 2: [2], 3: [3], 4: [4]}"),
+        (4, "{0: [0], 1: [1], 2: [2], 3: [3], ...}"),
+    ],
+)
+def test_groups_repr_truncates(max_seq_items, expected):
+    # GH 1135
+    df = pd.DataFrame(np.random.randn(5, 1))
+    df["a"] = df.index
+
+    with pd.option_context("display.max_seq_items", max_seq_items):
+        result = df.groupby("a").groups.__repr__()
+        assert result == expected
+
+        result = df.groupby(np.array(df.a)).groups.__repr__()
+        assert result == expected
