From 2985c313c9db61687130e0b7c1529ff5f91c3d9a Mon Sep 17 00:00:00 2001 From: Ben Date: Sun, 20 Jan 2019 16:02:59 -0800 Subject: [PATCH 1/5] Add truncatable repr for DF groupby groups Roll back added params to __pprint_dict. All logic now in __repr__ def. Make tests more general Remove unused line of code Move truncated dict repr to Index.groupby() Add correct groups object A few misc items for the linter Use pprint_thing in IndexGroupByGroups. Add whatsnew, docstring, and a couple typo fixes Update tests to expect pprint formatting. Use new config location. Small update in doc. Add nonsense to AUTHORS.md Revert "Add nonsense to AUTHORS.md" This reverts commit 9621669be1e6aa66587cc0871919aa558f01365d. --- pandas/core/frame.py | 2 +- pandas/core/groupby/groupby.py | 25 ++++++++++++++++++++++++- pandas/core/groupby/ops.py | 3 ++- pandas/core/indexes/base.py | 11 ++++++++++- pandas/io/formats/printing.py | 4 ++-- pandas/tests/io/formats/test_format.py | 16 ++++++++++++++++ 6 files changed, 55 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f3a0cf3841b5b..68c5809479b70 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -607,7 +607,7 @@ def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool: Check if full repr fits in horizontal boundaries imposed by the display options width and max_columns. - In case off non-interactive session, no boundaries apply. + In case of non-interactive session, no boundaries apply. `ignore_width` is here so ipnb+HTML output can behave the way users expect. display.max_columns remains in effect. diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index aa21aa452be95..f55ff4617b2a4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -56,6 +56,7 @@ class providing the base-class of operations. from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com +from pandas.core.config import option_context from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, ops @@ -72,7 +73,7 @@ class providing the base-class of operations. _apply_docs = dict( template=""" - Apply function `func` group-wise and combine the results together. + Apply function `func` group-wise and combine the results together. The function passed to `apply` must take a {input} as its first argument and return a DataFrame, Series or scalar. `apply` will @@ -2567,3 +2568,25 @@ def get_groupby( observed=observed, mutated=mutated, ) + + +class DataFrameGroups(dict): + def __repr__(self): + from pandas.compat import u + + nitems = get_option('display.max_rows') or len(self) + + fmt = u("{{{things}}}") + pfmt = u("{key}: {val}") + + pairs = [] + for k, v in list(self.items()): + pairs.append(pfmt.format(key=k, val=v)) + + if nitems < len(self): + start_cnt, end_cnt = nitems - int(nitems / 2), int(nitems / 2) + return fmt.format(things=", ".join(pairs[:start_cnt]) + + ", ... , " + + ", ".join(pairs[-end_cnt:])) + else: + return fmt.format(things=", ".join(pairs)) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 679d3668523c2..3e61262802fb0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -246,6 +246,7 @@ def size(self) -> Series: @cache_readonly def groups(self): """ dict {group name -> group labels} """ + if len(self.groupings) == 1: return self.groupings[0].groups else: @@ -350,7 +351,7 @@ def get_group_levels(self): def _is_builtin_func(self, arg): """ - if we define an builtin function for this argument, return it, + if we define a builtin function for this argument, return it, otherwise return the arg """ return SelectionMixin._builtin_table.get(arg, arg) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 10d9552e6f5a7..0299599254277 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -68,6 +68,7 @@ from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com from pandas.core.indexers import maybe_convert_indices +from pandas.core.config import get_option from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing from pandas.core.ops import get_op_result_name @@ -4790,7 +4791,7 @@ def groupby(self, values) -> Dict[Hashable, np.ndarray]: # map to the label result = {k: self.take(v) for k, v in result.items()} - return result + return IndexGroupbyGroups(result) def map(self, mapper, na_action=None): """ @@ -5501,6 +5502,14 @@ def shape(self): Index._add_comparison_methods() +class IndexGroupbyGroups(dict): + """Dict extension to support abbreviated __repr__""" + from pandas.io.formats.printing import pprint_thing + + def __repr__(self): + return pprint_thing(self, max_seq_items=get_option('display.max_rows')) + + def ensure_index_from_sequences(sequences, names=None): """ Construct an index from sequences of data. diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 4b5b5e9a0ce15..a0ee759e3b8b1 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -98,7 +98,7 @@ def _pprint_seq( ) -> str: """ internal. pprinter for iterables. you should probably use pprint_thing() - rather then calling this directly. + rather than calling this directly. bounds length of printed sequence, depending on options """ @@ -133,7 +133,7 @@ def _pprint_dict( ) -> str: """ internal. pprinter for iterables. you should probably use pprint_thing() - rather then calling this directly. + rather than calling this directly. """ fmt = "{{{things}}}" pairs = [] diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 97956489e7da6..e7ffc4cf4ba13 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2128,6 +2128,22 @@ def test_period(self): assert str(df) == exp +class TestDataFrameGroupByFormatting(object): + def test_groups_repr_truncates(self): + df = pd.DataFrame({ + 'a': [1, 1, 1, 2, 2, 3], + 'b': [1, 2, 3, 4, 5, 6] + }) + + with option_context('display.max_rows', 2): + x = df.groupby('a').groups + assert x.__repr__().endswith('...}') + + with option_context('display.max_rows', 5): + x = df.groupby('a').groups + assert not x.__repr__().endswith('...}') + + def gen_series_formatting(): s1 = pd.Series(["a"] * 100) s2 = pd.Series(["ab"] * 100) From 7064f09ea387fecca65bbffa76c6d855b901a27a Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jan 2020 10:44:43 +0000 Subject: [PATCH 2/5] Fix conflicts, move classes according to previous review --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/groupby/groupby.py | 23 ----------------------- pandas/core/groupby/ops.py | 1 - pandas/core/indexes/base.py | 12 ++---------- pandas/io/formats/printing.py | 7 +++++++ pandas/tests/groupby/test_groupby.py | 13 +++++++++++++ pandas/tests/io/formats/test_format.py | 16 ---------------- 7 files changed, 23 insertions(+), 51 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 920919755dc23..327a099259b75 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -54,7 +54,7 @@ Other API changes - :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) -- +- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`) Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f55ff4617b2a4..07967e5dad9ec 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -56,7 +56,6 @@ class providing the base-class of operations. from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com -from pandas.core.config import option_context from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, ops @@ -2568,25 +2567,3 @@ def get_groupby( observed=observed, mutated=mutated, ) - - -class DataFrameGroups(dict): - def __repr__(self): - from pandas.compat import u - - nitems = get_option('display.max_rows') or len(self) - - fmt = u("{{{things}}}") - pfmt = u("{key}: {val}") - - pairs = [] - for k, v in list(self.items()): - pairs.append(pfmt.format(key=k, val=v)) - - if nitems < len(self): - start_cnt, end_cnt = nitems - int(nitems / 2), int(nitems / 2) - return fmt.format(things=", ".join(pairs[:start_cnt]) + - ", ... , " + - ", ".join(pairs[-end_cnt:])) - else: - return fmt.format(things=", ".join(pairs)) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3e61262802fb0..5e13779608dd6 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -246,7 +246,6 @@ def size(self) -> Series: @cache_readonly def groups(self): """ dict {group name -> group labels} """ - if len(self.groupings) == 1: return self.groupings[0].groups else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0299599254277..4dd5c9826165c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -68,7 +68,6 @@ from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com from pandas.core.indexers import maybe_convert_indices -from pandas.core.config import get_option from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing from pandas.core.ops import get_op_result_name @@ -76,6 +75,7 @@ from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( + PrettyDict, default_pprint, format_object_attrs, format_object_summary, @@ -4791,7 +4791,7 @@ def groupby(self, values) -> Dict[Hashable, np.ndarray]: # map to the label result = {k: self.take(v) for k, v in result.items()} - return IndexGroupbyGroups(result) + return PrettyDict(result) def map(self, mapper, na_action=None): """ @@ -5502,14 +5502,6 @@ def shape(self): Index._add_comparison_methods() -class IndexGroupbyGroups(dict): - """Dict extension to support abbreviated __repr__""" - from pandas.io.formats.printing import pprint_thing - - def __repr__(self): - return pprint_thing(self, max_seq_items=get_option('display.max_rows')) - - def ensure_index_from_sequences(sequences, names=None): """ Construct an index from sequences of data. diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index a0ee759e3b8b1..a8b9db28dbe14 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -528,3 +528,10 @@ def format_object_attrs( if len(obj) > max_seq_items: attrs.append(("length", len(obj))) return attrs + + +class PrettyDict(dict): + """Dict extension to support abbreviated __repr__""" + + def __repr__(self): + return pprint_thing(self, max_seq_items=get_option("display.max_rows")) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index eb9552fbbebc1..16af4fcdcfed3 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2037,3 +2037,16 @@ def test_groupby_list_level(): expected = pd.DataFrame(np.arange(0, 9).reshape(3, 3)) result = expected.groupby(level=[0]).mean() tm.assert_frame_equal(result, expected) + + +def test_groups_repr_truncates(): + # GH 1135 + df = pd.DataFrame({"a": [1, 1, 1, 2, 2, 3], "b": [1, 2, 3, 4, 5, 6]}) + + with pd.option_context("display.max_rows", 2): + x = df.groupby("a").groups + assert x.__repr__().endswith("...}") + + with pd.option_context("display.max_rows", 5): + x = df.groupby(np.array(df.a)).groups + assert not x.__repr__().endswith("...}") diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index e7ffc4cf4ba13..97956489e7da6 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2128,22 +2128,6 @@ def test_period(self): assert str(df) == exp -class TestDataFrameGroupByFormatting(object): - def test_groups_repr_truncates(self): - df = pd.DataFrame({ - 'a': [1, 1, 1, 2, 2, 3], - 'b': [1, 2, 3, 4, 5, 6] - }) - - with option_context('display.max_rows', 2): - x = df.groupby('a').groups - assert x.__repr__().endswith('...}') - - with option_context('display.max_rows', 5): - x = df.groupby('a').groups - assert not x.__repr__().endswith('...}') - - def gen_series_formatting(): s1 = pd.Series(["a"] * 100) s2 = pd.Series(["ab"] * 100) From 44a68e50f6db56c2334fad29d71f0eb97521edbb Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jan 2020 11:23:01 +0000 Subject: [PATCH 3/5] fix minor typo in contributing --- doc/source/development/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index a9237c239701b..f904781178656 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -32,7 +32,7 @@ check each issue individually, and it's not possible to find the unassigned ones For this reason, we implemented a workaround consisting of adding a comment with the exact text `take`. When you do it, a GitHub action will automatically assign you the issue -(this will take seconds, and may require refreshint the page to see it). +(this will take seconds, and may require refreshing the page to see it). By doing this, it's possible to filter the list of issues and find only the unassigned ones. So, a good way to find an issue to start contributing to pandas is to check the list of From 5eb98e49ab4acd4ccd80ed49332c56d3027b2f0a Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jan 2020 14:22:02 +0000 Subject: [PATCH 4/5] rewrite tests, typing --- pandas/core/indexes/base.py | 4 ++-- pandas/io/formats/printing.py | 8 ++++++-- pandas/tests/groupby/test_groupby.py | 14 +++++++------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4dd5c9826165c..3f3684a5bb059 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import Any, Dict, FrozenSet, Hashable, Optional, Union +from typing import Any, FrozenSet, Hashable, Optional, Union import warnings import numpy as np @@ -4766,7 +4766,7 @@ def _maybe_promote(self, other): return self.astype("object"), other.astype("object") return self, other - def groupby(self, values) -> Dict[Hashable, np.ndarray]: + def groupby(self, values) -> PrettyDict: """ Group the index labels by a given array of values. diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index a8b9db28dbe14..ff03c8127ee19 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -6,12 +6,14 @@ from typing import ( Any, Callable, + Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, + TypeVar, Union, ) @@ -20,6 +22,8 @@ from pandas.core.dtypes.inference import is_sequence EscapeChars = Union[Mapping[str, str], Iterable[str]] +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") def adjoin(space: int, *lists: List[str], **kwargs) -> str: @@ -530,8 +534,8 @@ def format_object_attrs( return attrs -class PrettyDict(dict): +class PrettyDict(Dict[_KT, _VT]): """Dict extension to support abbreviated __repr__""" - def __repr__(self): + def __repr__(self) -> str: return pprint_thing(self, max_seq_items=get_option("display.max_rows")) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 16af4fcdcfed3..3921990d1b3ad 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2041,12 +2041,12 @@ def test_groupby_list_level(): def test_groups_repr_truncates(): # GH 1135 - df = pd.DataFrame({"a": [1, 1, 1, 2, 2, 3], "b": [1, 2, 3, 4, 5, 6]}) + df = pd.DataFrame(np.random.randn(61, 1)) + df["a"] = df.index - with pd.option_context("display.max_rows", 2): - x = df.groupby("a").groups - assert x.__repr__().endswith("...}") + result = df.groupby("a").groups.__repr__() + expected = str({i: [i] for i in range(60)})[:-1] + ", ...}" + assert result == expected - with pd.option_context("display.max_rows", 5): - x = df.groupby(np.array(df.a)).groups - assert not x.__repr__().endswith("...}") + result = df.groupby(np.array(df.a)).groups.__repr__() + assert result == expected From b6b9751159b044e8933568436282ab26b085d00b Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jan 2020 18:52:24 +0000 Subject: [PATCH 5/5] parametrize test, allow max_seq_items to determine truncation --- pandas/core/indexes/base.py | 2 +- pandas/io/formats/printing.py | 6 +++--- pandas/tests/groupby/test_groupby.py | 21 ++++++++++++++------- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3f3684a5bb059..cf9e84cab6a84 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4766,7 +4766,7 @@ def _maybe_promote(self, other): return self.astype("object"), other.astype("object") return self, other - def groupby(self, values) -> PrettyDict: + def groupby(self, values) -> PrettyDict[Hashable, np.ndarray]: """ Group the index labels by a given array of values. diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index ff03c8127ee19..f8f02f4d4902a 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -102,7 +102,7 @@ def _pprint_seq( ) -> str: """ internal. pprinter for iterables. you should probably use pprint_thing() - rather than calling this directly. + rather then calling this directly. bounds length of printed sequence, depending on options """ @@ -137,7 +137,7 @@ def _pprint_dict( ) -> str: """ internal. pprinter for iterables. you should probably use pprint_thing() - rather than calling this directly. + rather then calling this directly. """ fmt = "{{{things}}}" pairs = [] @@ -538,4 +538,4 @@ class PrettyDict(Dict[_KT, _VT]): """Dict extension to support abbreviated __repr__""" def __repr__(self) -> str: - return pprint_thing(self, max_seq_items=get_option("display.max_rows")) + return pprint_thing(self) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3921990d1b3ad..b7d7124a3a5e5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2039,14 +2039,21 @@ def test_groupby_list_level(): tm.assert_frame_equal(result, expected) -def test_groups_repr_truncates(): +@pytest.mark.parametrize( + "max_seq_items, expected", + [ + (5, "{0: [0], 1: [1], 2: [2], 3: [3], 4: [4]}"), + (4, "{0: [0], 1: [1], 2: [2], 3: [3], ...}"), + ], +) +def test_groups_repr_truncates(max_seq_items, expected): # GH 1135 - df = pd.DataFrame(np.random.randn(61, 1)) + df = pd.DataFrame(np.random.randn(5, 1)) df["a"] = df.index - result = df.groupby("a").groups.__repr__() - expected = str({i: [i] for i in range(60)})[:-1] + ", ...}" - assert result == expected + with pd.option_context("display.max_seq_items", max_seq_items): + result = df.groupby("a").groups.__repr__() + assert result == expected - result = df.groupby(np.array(df.a)).groups.__repr__() - assert result == expected + result = df.groupby(np.array(df.a)).groups.__repr__() + assert result == expected