Skip to content

Commit 7c76086

Browse files
jseabold authored and jreback committed
ENH: Make categorical repr nicer.
ENH: Support printing empty categorical. TST: Add tests for categorical printing. REF: Remove unnecessary unicode calls. TST: Hack so tests pass with numpy < 1.7.x CLN: fix DataFrame import in core/categorical.py
1 parent b891d1b commit 7c76086

File tree

4 files changed

+170
-8
lines changed

4 files changed

+170
-8
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ API Changes
233233
This enables a pure label-based slicing paradigm that makes ``[],ix,loc`` for scalar indexing and slicing work exactly the same.
234234
Indexing on other index types is preserved (and positional fallback for ``[],ix``), with the exception that floating point slicing
235235
on indexes that are not a ``Float64Index`` will raise a ``TypeError``, e.g. ``Series(range(5))[3.5:4.5]`` (:issue:`263`)
236+
- Make Categorical repr nicer (:issue:`4368`)
236237

237238
Internal Refactoring
238239
~~~~~~~~~~~~~~~~~~~~

pandas/core/categorical.py

+47-8
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
from pandas.core.base import PandasObject
77
from pandas.core.index import Index
88
import pandas.core.common as com
9+
from pandas.util.terminal import get_terminal_size
10+
from pandas.core.config import get_option
11+
from pandas.core import format as fmt
912

1013

1114
def _cat_compare_op(op):
@@ -133,20 +136,56 @@ def __array__(self, dtype=None):
133136
def __len__(self):
134137
return len(self.labels)
135138

136-
def __unicode__(self):
137-
temp = 'Categorical: %s\n%s\n%s'
138-
values = com.pprint_thing(np.asarray(self))
139-
levheader = 'Levels (%d): ' % len(self.levels)
140-
levstring = np.array_repr(self.levels,
141-
max_line_width=60)
139+
def _tidy_repr(self, max_vals=20):
    """Return a truncated repr: leading values, ``...``, trailing values.

    Parameters
    ----------
    max_vals : int, default 20
        Total number of values to show.  The first ``max_vals // 2`` come
        from the start and the remainder from the end.

    Returns
    -------
    str
        The two rendered slices separated by a ``...`` line, followed by
        the text from ``self._repr_footer()``.
    """
    head_count = max_vals // 2
    tail_count = max_vals - head_count

    # Each slice is rendered bare; the single footer appended below
    # describes the whole object rather than either slice.
    bare = dict(length=False, name=False, footer=False)
    leading = self[:head_count]._get_repr(**bare)
    trailing = self[-tail_count:]._get_repr(**bare)

    body = '%s\n...\n%s' % (leading, trailing)
    # TODO: tidy_repr for footer since there may be a ton of levels?
    return '%s\n%s' % (body, self._repr_footer())
151+
152+
def _repr_footer(self):
153+
levheader = 'Levels (%d): ' % len(self.levels)
154+
#TODO: should max_line_width respect a setting?
155+
levstring = np.array_repr(self.levels, max_line_width=60)
143156
indent = ' ' * (levstring.find('[') + len(levheader) + 1)
144157
lines = levstring.split('\n')
145158
levstring = '\n'.join([lines[0]] +
146159
[indent + x.lstrip() for x in lines[1:]])
147-
name = '' if self.name is None else self.name
148-
return temp % (name, values, levheader + levstring)
149160

161+
namestr = u"Name: %s, " % com.pprint_thing(
162+
self.name) if self.name is not None else ""
163+
return u'%s\n%sLength: %d' % (levheader + levstring, namestr,
164+
len(self))
165+
166+
def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True):
    """Render this object through ``fmt.CategoricalFormatter``.

    All arguments are forwarded unchanged to the formatter's constructor;
    the return value is whatever its ``to_string()`` produces.
    """
    options = dict(name=name, length=length, na_rep=na_rep, footer=footer)
    return fmt.CategoricalFormatter(self, **options).to_string()
172+
173+
def __unicode__(self):
    """Return the text representation used by ``repr``/``str``.

    Three layouts are produced, depending on the number of labels:

    * more labels than the row limit: a truncated view from
      ``self._tidy_repr``;
    * at least one label: the full view from ``self._get_repr``, with the
      length shown only when there are more than 50 values;
    * no labels: ``Categorical([], `` followed by the footer.
    """
    height = get_terminal_size()[1]
    max_rows = get_option("display.max_rows")
    if max_rows == 0:
        # A setting of 0 means "use the terminal height".
        max_rows = height

    # The option may be falsy (e.g. None); fall back to 1000 in that case.
    # The same resolved limit is used for the comparison AND for sizing the
    # truncated view, so a None setting no longer reaches ``min(30, None)``.
    limit = max_rows or 1000
    n_labels = len(self.labels)

    if n_labels > limit:
        # Show at most 26 values (30 rows minus 4), but never fewer than
        # one: a zero or negative count would make _tidy_repr slice the
        # wrong end of the data.
        result = self._tidy_repr(max(min(30, limit) - 4, 1))
    elif n_labels > 0:
        result = self._get_repr(length=len(self) > 50, name=True)
    else:
        # NOTE: no closing parenthesis is appended here; the tests added
        # with this change pin this exact output, so it is kept as is.
        result = u'Categorical([], %s' % self._get_repr(name=True,
                                                        length=False,
                                                        footer=True)

    return result
150189

151190
def __getitem__(self, key):
152191
if isinstance(key, (int, np.integer)):

pandas/core/format.py

+63
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,69 @@
6161
-------
6262
formatted : string (or unicode, depending on data and options)"""
6363

64+
class CategoricalFormatter(object):
    """Render a categorical's values, one per line, plus a summary footer.

    Parameters
    ----------
    categorical : object
        The object to render.  It must support ``len()``, conversion via
        ``np.asarray`` and expose ``levels`` and ``name`` attributes.
    buf : file-like, optional
        Stored on the instance; a fresh ``StringIO`` is created when omitted.
        Nothing in this class writes to it.
    length : bool, default True
        Include ``Length: N`` in the footer.
    na_rep : str, default 'NaN'
        Text passed to ``format_array`` for missing values.
    name : bool, default False
        Include ``Name: ...`` in the footer when the categorical has a name.
    footer : bool, default True
        Append the footer to the output of ``to_string``.
    """

    def __init__(self, categorical, buf=None, length=True,
                 na_rep='NaN', name=False, footer=True):
        self.categorical = categorical
        self.buf = buf if buf is not None else StringIO(u"")
        self.name = name
        self.na_rep = na_rep
        self.length = length
        self.footer = footer

    def _get_footer(self):
        """Return ``Name``, ``Length`` and ``Levels`` joined by ``', '``.

        The name and length parts are optional; the levels part is always
        present, so the result is never empty.
        """
        categorical = self.categorical
        parts = []

        # Only format the name when it will actually be shown.
        if self.name and categorical.name is not None:
            shown_name = com.pprint_thing(categorical.name,
                                          escape_chars=('\t', '\r', '\n'))
            parts.append('Name: %s' % shown_name)

        if self.length:
            parts.append("Length: %d" % len(categorical))

        levheader = 'Levels (%d): ' % len(categorical.levels)

        # TODO: should max_line_width respect a setting?
        levstring = np.array_repr(categorical.levels, max_line_width=60)
        # Continuation lines of a wrapped array repr are re-indented so
        # they line up under the first element, after the header text.
        indent = ' ' * (levstring.find('[') + len(levheader) + 1)
        lines = levstring.split('\n')
        levstring = '\n'.join([lines[0]] +
                              [indent + x.lstrip() for x in lines[1:]])
        parts.append(levheader + levstring)

        return u', '.join(parts)

    def _get_formatted_values(self):
        """Return the values formatted by ``format_array``."""
        return format_array(np.asarray(self.categorical), None,
                            float_format=None,
                            na_rep=self.na_rep)

    def to_string(self):
        """Return the full text: one value per line, then the footer.

        An empty categorical yields only the footer, or an empty string
        when the footer is disabled.
        """
        if len(self.categorical) == 0:
            return self._get_footer() if self.footer else u''

        result = [u'%s' % value for value in self._get_formatted_values()]
        if self.footer:
            footer = self._get_footer()
            if footer:
                result.append(footer)

        return u'\n'.join(result)
126+
64127

65128
class SeriesFormatter(object):
66129

pandas/tests/test_categorical.py

+59
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pandas.compat import range, lrange
55
import unittest
66
import nose
7+
import re
78

89
import numpy as np
910

@@ -123,6 +124,64 @@ def test_describe(self):
123124
).set_index('levels')
124125
tm.assert_frame_equal(desc, expected)
125126

127+
def test_print(self):
    """repr of ``self.factor`` lists every value, then the levels footer."""
    expected = [" a", " b", " b", " a", " a", " c", " c", " c",
                "Levels (3): Index([a, b, c], dtype=object)"]
    expected = "\n".join(expected)
    # hack because array_repr changed in numpy > 1.6.x
    actual = repr(self.factor)
    # Raw string: ``\(`` and ``\[`` are regex escapes, not string escapes.
    pat = r"Index\(\['a', 'b', 'c']"
    sub = "Index([a, b, c]"
    actual = re.sub(pat, sub, actual)

    self.assertEqual(actual, expected)
138+
139+
def test_big_print(self):
    """A 600-value categorical prints 13 head and 13 tail values."""
    factor = Categorical([0,1,2,0,1,2]*100, ['a', 'b', 'c'], name='cat')
    expected = [" a", " b", " c", " a", " b", " c", " a", " b", " c",
                " a", " b", " c", " a", "...", " c", " a", " b", " c",
                " a", " b", " c", " a", " b", " c", " a", " b", " c",
                "Levels (3): Index([a, b, c], dtype=object)",
                "Name: cat, Length: 600" ]
    expected = "\n".join(expected)

    # hack because array_repr changed in numpy > 1.6.x
    actual = repr(factor)
    # Raw string: ``\(`` and ``\[`` are regex escapes, not string escapes.
    pat = r"Index\(\['a', 'b', 'c']"
    sub = "Index([a, b, c]"
    actual = re.sub(pat, sub, actual)

    self.assertEqual(actual, expected)
155+
156+
def test_empty_print(self):
    """Empty categoricals print ``Categorical([], `` plus the footer.

    Covers a named one, an unnamed one, and one with no levels at all.
    The expected strings have no closing parenthesis for ``Categorical(``
    because that is what the repr currently emits.
    """
    # hack because array_repr changed in numpy > 1.6.x
    # Raw string: ``\(`` and ``\[`` are regex escapes, not string escapes.
    pat = r"Index\(\['a', 'b', 'c']"
    sub = "Index([a, b, c]"

    factor = Categorical([], ["a","b","c"], name="cat")
    expected = ("Categorical([], Name: cat, Levels (3): "
                "Index([a, b, c], dtype=object)")
    actual = re.sub(pat, sub, repr(factor))
    self.assertEqual(actual, expected)

    factor = Categorical([], ["a","b","c"])
    expected = ("Categorical([], Levels (3): "
                "Index([a, b, c], dtype=object)")
    actual = re.sub(pat, sub, repr(factor))
    self.assertEqual(actual, expected)

    # No levels means nothing to normalise, so the repr is compared as is.
    factor = Categorical([], [])
    expected = ("Categorical([], Levels (0): "
                "Index([], dtype=object)")
    self.assertEqual(repr(factor), expected)
183+
184+
126185
if __name__ == '__main__':
127186
import nose
128187
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)