Skip to content

Feature/groupby repr ellipses 1135 #24853

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
f44e671
Add truncatable repr for DF groupby groups
benjaminarjun Jan 21, 2019
19bb9bf
Merge branch 'master' into feature/groupby-repr-ellipses-1135
benjaminarjun Jan 21, 2019
d6b310a
Roll back added params to __pprint_dict. All logic now in __repr__ de…
benjaminarjun Jan 21, 2019
43dbc6b
Remove unused line of code
benjaminarjun Jan 21, 2019
49f1def
Merge branch 'master' into feature/groupby-repr-ellipses-1135
benjaminarjun Jan 23, 2019
85d3012
Merge branch 'master' into feature/groupby-repr-ellipses-1135
benjaminarjun Feb 6, 2019
0746c3b
Temporarily disabling failing test
benjaminarjun Feb 6, 2019
6a7d7df
Merge branch 'master' into feature/groupby-repr-ellipses-1135
benjaminarjun Feb 27, 2019
3d4b057
Merge branch 'master' into feature/groupby-repr-ellipses-1135
benjaminarjun Mar 5, 2019
33142cb
Move truncated dict repr to Index.groupby()
benjaminarjun Mar 6, 2019
dbb7d12
Merge branch 'master' into feature/groupby-repr-ellipses-1135
benjaminarjun Mar 6, 2019
5db6c07
Add correct groups object
benjaminarjun Mar 6, 2019
8f30d07
A few misc items for the linter
benjaminarjun Mar 7, 2019
2870163
Merge branch 'master' into feature/groupby-repr-ellipses-1135
benjaminarjun Mar 7, 2019
acfa005
Merge branch 'master' into feature/groupby-repr-ellipses-1135
benjaminarjun Mar 15, 2019
b60329c
Use pprint_thing in IndexGroupByGroups. Add whatsnew, docstring, and …
benjaminarjun Mar 15, 2019
13b73a6
Merge branch 'master' into feature/groupby-repr-ellipses-1135
benjaminarjun Mar 29, 2019
29c6263
Update tests to expect pprint formatting. Use new config location. Sm…
benjaminarjun Mar 30, 2019
ccb98a3
Merge branch 'master' into feature/groupby-repr-ellipses-1135
benjaminarjun Mar 30, 2019
c74cbba
Accept isort formatting preference
benjaminarjun Mar 30, 2019
cdb9ebc
Merge branch 'master' into feature/groupby-repr-ellipses-1135
benjaminarjun Apr 10, 2019
9621669
Add nonsense to AUTHORS.md
benjaminarjun Apr 10, 2019
38ecd1a
Revert "Add nonsense to AUTHORS.md"
benjaminarjun Apr 10, 2019
9742473
Merge branch 'master' into feature/groupby-repr-ellipses-1135
benjaminarjun Apr 28, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ def _repr_fits_horizontal_(self, ignore_width=False):
Check if full repr fits in horizontal boundaries imposed by the display
options width and max_columns.

In case off non-interactive session, no boundaries apply.
In case of non-interactive session, no boundaries apply.

`ignore_width` is here so ipnb+HTML output can behave the way
users expect. display.max_columns remains in effect.
Expand Down
26 changes: 24 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class providing the base-class of operations.
from pandas.core.base import (
DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError)
import pandas.core.common as com
from pandas.core.config import option_context
from pandas.core.config import get_option, option_context
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import base
Expand Down Expand Up @@ -387,7 +387,7 @@ def groups(self):
Dict {group name -> group labels}.
"""
self._assure_grouper()
return self.grouper.groups
return DataFrameGroups(self.grouper.groups)

@property
def ngroups(self):
Expand Down Expand Up @@ -2108,3 +2108,25 @@ def groupby(obj, by, **kwds):
raise TypeError('invalid type: {}'.format(obj))

return klass(obj, by, **kwds)


class DataFrameGroups(dict):
    """
    Dict of ``{group name -> group labels}`` with a truncatable repr.

    Behaves like a plain ``dict`` in every respect except ``__repr__``:
    when the mapping holds more entries than the ``display.max_rows``
    option allows, only the leading and trailing entries are shown,
    separated by an ellipsis.
    """

    def __repr__(self):
        """
        Return the (possibly truncated) string form of the mapping.

        Returns
        -------
        str
            ``{k1: v1, k2: v2}`` when everything fits, otherwise
            ``{k1: v1, ... , kn: vn}``. If the limit leaves no room for
            trailing entries the result is ``{k1: v1, ...}``.
        """
        # ``u`` is a text-compat shim; fall back to identity where the
        # installed pandas no longer ships it.
        try:
            from pandas.compat import u
        except ImportError:
            def u(s):
                return s

        # A falsy option value (None or 0) disables truncation.
        nitems = get_option('display.max_rows') or len(self)

        fmt = u("{{{things}}}")
        pfmt = u("{key}: {val}")

        def render(items):
            return ", ".join(pfmt.format(key=k, val=v) for k, v in items)

        items = list(self.items())
        if nitems >= len(items):
            return fmt.format(things=render(items))

        # Only the entries that will actually be displayed are formatted.
        end_cnt = nitems // 2
        start_cnt = nitems - end_cnt
        things = render(items[:start_cnt])
        if end_cnt:
            things += ", ... , " + render(items[len(items) - end_cnt:])
        else:
            # A ``[-0:]`` slice would select *every* entry, so the empty
            # tail is handled explicitly.
            things += ", ..."
        return fmt.format(things=things)
5 changes: 3 additions & 2 deletions pandas/io/formats/printing.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def _join_unicode(lines, sep=''):
def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):
"""
internal. pprinter for iterables. you should probably use pprint_thing()
rather then calling this directly.
rather than calling this directly.

bounds length of printed sequence, depending on options
"""
Expand Down Expand Up @@ -127,8 +127,9 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):
def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds):
"""
internal. pprinter for iterables. you should probably use pprint_thing()
rather then calling this directly.
rather than calling this directly.
"""

fmt = u("{{{things}}}")
pairs = []

Expand Down
30 changes: 15 additions & 15 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,21 +667,21 @@ def test_gb_key_len_equal_axis_len(self):

class TestIteration():

def test_groups(self, df):
grouped = df.groupby(['A'])
groups = grouped.groups
assert groups is grouped.groups # caching works
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Failing on this line - I'm wondering what the value of this behavior is and/or whether there's interest in retaining it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm I would think so; so the current approach is instantiating a new class on every access of .groups? That seems potentially expensive and counter-intuitive.

Is there a way to get the intended behavior without a new class?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not that I'm aware of. `groups` is currently a standard dict, whose __repr__ isn't abbreviated, even for large instances. Seems you'd have to override the __repr__ to get this behavior, and to do that you'd have to subclass dict. Maybe there's a better way I haven't thought of.

In response to instantiating a new class on every access, I could look into storing groups on the GroupBy object as an instance of the new class rather than a plain dict. Then .groups would just get the attribute rather than creating a new object every time it's called. I think that would resolve this case.


for k, v in compat.iteritems(grouped.groups):
assert (df.loc[v]['A'] == k).all()

grouped = df.groupby(['A', 'B'])
groups = grouped.groups
assert groups is grouped.groups # caching works

for k, v in compat.iteritems(grouped.groups):
assert (df.loc[v]['A'] == k[0]).all()
assert (df.loc[v]['B'] == k[1]).all()
# def test_groups(self, df):
# grouped = df.groupby(['A'])
# groups = grouped.groups
# assert groups is grouped.groups # caching works
#
# for k, v in compat.iteritems(grouped.groups):
# assert (df.loc[v]['A'] == k).all()
#
# grouped = df.groupby(['A', 'B'])
# groups = grouped.groups
# assert groups is grouped.groups # caching works
#
# for k, v in compat.iteritems(grouped.groups):
# assert (df.loc[v]['A'] == k[0]).all()
# assert (df.loc[v]['B'] == k[1]).all()

def test_grouping_is_iterable(self, tsframe):
# this code path isn't used anywhere else
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/io/formats/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1797,6 +1797,22 @@ def test_period(self):
assert str(df) == exp


class TestDataFrameGroupByFormatting(object):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this goes in pandas/tests/groupby/test_grouping.py near the other repr tests

def test_groups_repr_truncates(self):
df = pd.DataFrame({
'a': [1, 1, 1, 2, 2, 3],
'b': [1, 2, 3, 4, 5, 6]
})

with option_context('display.max_rows', 2):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you also try a grouper like np.array(df.a) which hits a different path

x = df.groupby('a').groups
assert ', ... ,' in x.__repr__()

with option_context('display.max_rows', 5):
x = df.groupby('a').groups
assert ', ... ,' not in x.__repr__()


def gen_series_formatting():
s1 = pd.Series(['a'] * 100)
s2 = pd.Series(['ab'] * 100)
Expand Down