ENH: Make categorical repr nicer.

jseabold · jreback · commit 7c760868f093 · 2013-09-25T20:38:45.000-04:00
ENH: Support printing empty categorical.

TST: Add tests for categorical printing.

REF: Remove unnecessary unicode calls.

TST: Hack so tests pass with numpy < 1.7.x

CLN: fix DataFrame import in core/categorical.py
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -233,6 +233,7 @@ API Changes
     This enables a pure label-based slicing paradigm that makes ``[],ix,loc`` for scalar indexing and slicing work exactly the same.
     Indexing on other index types are preserved (and positional fallback for ``[],ix``), with the exception, that floating point slicing
     on indexes on non ``Float64Index`` will raise a ``TypeError``, e.g. ``Series(range(5))[3.5:4.5]`` (:issue:`263`)
+  - Make Categorical repr nicer (:issue:`4368`)
 
 Internal Refactoring
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -6,6 +6,9 @@
 from pandas.core.base import PandasObject
 from pandas.core.index import Index
 import pandas.core.common as com
+from pandas.util.terminal import get_terminal_size
+from pandas.core.config import get_option
+from pandas.core import format as fmt
 
 
 def _cat_compare_op(op):
@@ -133,20 +136,56 @@ def __array__(self, dtype=None):
     def __len__(self):
         return len(self.labels)
 
-    def __unicode__(self):
-        temp = 'Categorical: %s\n%s\n%s'
-        values = com.pprint_thing(np.asarray(self))
-        levheader = 'Levels (%d): ' % len(self.levels)
-        levstring = np.array_repr(self.levels,
-                                  max_line_width=60)
+    def _tidy_repr(self, max_vals=20):
+        """Return a truncated repr: head values, '...', tail values, footer."""
+        # Below 2 the tail slice would be ``self[-0:]``, i.e. every value.
+        max_vals = max(max_vals, 2)
+        num = max_vals // 2
+        kwds = dict(length=False, name=False, footer=False)
+        head = self[:num]._get_repr(**kwds)
+        tail = self[-(max_vals - num):]._get_repr(**kwds)
+        #TODO: tidy_repr for footer since there may be a ton of levels?
+        result = '%s\n...\n%s\n%s' % (head, tail, self._repr_footer())
 
+        return result
+
+    def _repr_footer(self):  # levels repr line(s), then "[Name: x, ]Length: n"
+        levheader = 'Levels (%d): ' % len(self.levels)
+        #TODO: should max_line_width respect a setting?
+        levstring = np.array_repr(self.levels, max_line_width=60)
         indent = ' ' * (levstring.find('[') + len(levheader) + 1)
         lines = levstring.split('\n')
         levstring = '\n'.join([lines[0]] +
                               [indent + x.lstrip() for x in lines[1:]])
-        name = '' if self.name is None else self.name
-        return temp % (name, values, levheader + levstring)
 
+        namestr = u"Name: %s, " % com.pprint_thing(
+                        self.name) if self.name is not None else ""
+        return u'%s\n%sLength: %d' % (levheader + levstring, namestr,
+                                      len(self))
+
+    def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True):
+        """Render the values (and optional footer) via CategoricalFormatter."""
+        options = dict(name=name, length=length, na_rep=na_rep,
+                       footer=footer)
+        renderer = fmt.CategoricalFormatter(self, **options)
+        return renderer.to_string()
+
+    def __unicode__(self):
+        """Unicode repr: the values plus footer, truncated past the row limit."""
+        max_rows = get_option("display.max_rows")
+        if max_rows == 0:  # 0: fall back to the terminal height
+            max_rows = get_terminal_size()[1]
+        # max_rows may be falsy (e.g. None); reuse the 1000 fallback for the
+        # truncated size too, so min() is never handed a non-number.
+        limit = max_rows or 1000
+        if len(self.labels) > limit:
+            result = self._tidy_repr(min(30, limit) - 4)
+        elif len(self.labels) > 0:
+            result = self._get_repr(length=len(self) > 50, name=True)
+        else:
+            result = u'Categorical([], %s' % self._get_repr(
+                name=True, length=False, footer=True)
+        return result
 
     def __getitem__(self, key):
         if isinstance(key, (int, np.integer)):
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -61,6 +61,69 @@
     -------
     formatted : string (or unicode, depending on data and options)"""
 
+class CategoricalFormatter(object):
+    """Render a Categorical as text: one value per line plus a footer.
+
+    ``name`` and ``length`` toggle those footer fields, ``footer`` the whole
+    footer; ``na_rep`` is handed to ``format_array``.  ``buf`` is only stored.
+    """
+
+    def __init__(self, categorical, buf=None, length=True,
+                 na_rep='NaN', name=False, footer=True):
+        self.categorical = categorical
+        self.buf = buf if buf is not None else StringIO(u"")
+        self.name = name
+        self.na_rep = na_rep
+        self.length = length
+        self.footer = footer
+
+    def _get_footer(self):
+        """Build ``[Name: x, ][Length: n, ]Levels (k): <levels repr>``."""
+        parts = []
+
+        # The name is shown only when requested *and* actually set.
+        if self.name and self.categorical.name is not None:
+            name = com.pprint_thing(self.categorical.name,
+                                    escape_chars=('\t', '\r', '\n'))
+            parts.append(u'Name: %s' % name)
+
+        if self.length:
+            parts.append(u'Length: %d' % len(self.categorical))
+
+        levels = self.categorical.levels
+        levheader = 'Levels (%d): ' % len(levels)
+
+        #TODO: should max_line_width respect a setting?
+        levstring = np.array_repr(levels, max_line_width=60)
+        # Align wrapped lines of the levels repr under its first element.
+        indent = ' ' * (levstring.find('[') + len(levheader) + 1)
+        lines = levstring.split('\n')
+        levstring = '\n'.join([lines[0]] +
+                              [indent + x.lstrip() for x in lines[1:]])
+        parts.append(levheader + levstring)
+
+        return u', '.join(parts)
+
+    def _get_formatted_values(self):
+        """Format the values with ``format_array``, passing ``na_rep`` through."""
+        return format_array(np.asarray(self.categorical), None,
+                            float_format=None,
+                            na_rep=self.na_rep)
+
+    def to_string(self):
+        """Return the rendered text; an empty categorical is footer-only."""
+        if len(self.categorical) == 0:
+            # No values to list: emit just the footer (or nothing at all).
+            return self._get_footer() if self.footer else u''
+
+        result = [u'%s' % i for i in self._get_formatted_values()]
+        if self.footer:
+            footer = self._get_footer()
+            if footer:
+                result.append(footer)
+
+        return u'\n'.join(result)
+
 
 class SeriesFormatter(object):
 
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -4,6 +4,7 @@
 from pandas.compat import range, lrange
 import unittest
 import nose
+import re
 
 import numpy as np
 
@@ -123,6 +124,64 @@ def test_describe(self):
                                             ).set_index('levels')
         tm.assert_frame_equal(desc, expected)
 
+    def test_print(self):
+        """repr of a short Categorical lists every value, then the levels."""
+        expected = [" a", " b", " b", " a", " a", " c", " c", " c",
+                    "Levels (3): Index([a, b, c], dtype=object)"]
+        expected = "\n".join(expected)
+        # hack because array_repr changed in numpy > 1.6.x: rewrite the quoted
+        # level names, when present, to the unquoted form used in ``expected``
+        pat = r"Index\(\['a', 'b', 'c']"
+        actual = re.sub(pat, "Index([a, b, c]", repr(self.factor))
+
+        self.assertEqual(actual, expected)
+
+    def test_big_print(self):
+        """repr of a long Categorical is truncated around a '...' marker."""
+        factor = Categorical([0,1,2,0,1,2]*100, ['a', 'b', 'c'], name='cat')
+        expected = [" a", " b", " c", " a", " b", " c", " a", " b", " c",
+                    " a", " b", " c", " a", "...", " c", " a", " b", " c",
+                    " a", " b", " c", " a", " b", " c", " a", " b", " c",
+                    "Levels (3): Index([a, b, c], dtype=object)",
+                    "Name: cat, Length: 600" ]
+        expected = "\n".join(expected)
+
+        # hack because array_repr changed in numpy > 1.6.x: rewrite the quoted
+        # level names, when present, to the unquoted form used in ``expected``
+        pat = r"Index\(\['a', 'b', 'c']"
+        actual = re.sub(pat, "Index([a, b, c]", repr(factor))
+
+        self.assertEqual(actual, expected)
+
+    def test_empty_print(self):
+        """repr of an empty Categorical: optional name, then the levels."""
+        # hack because array_repr changed in numpy > 1.6.x: rewrite the quoted
+        # level names, when present, to the unquoted form used in ``expected``
+        pat = r"Index\(\['a', 'b', 'c']"
+        sub = "Index([a, b, c]"
+
+        # NOTE: each expected string closes only the inner ``Index(...)``;
+        # the leading ``Categorical(`` is left open by the repr under test.
+        factor = Categorical([], ["a","b","c"], name="cat")
+        expected = ("Categorical([], Name: cat, Levels (3): "
+                    "Index([a, b, c], dtype=object)")
+        actual = re.sub(pat, sub, repr(factor))
+        self.assertEqual(actual, expected)
+
+        # unnamed: no ``Name:`` field is expected
+        factor = Categorical([], ["a","b","c"])
+        expected = ("Categorical([], Levels (3): "
+                    "Index([a, b, c], dtype=object)")
+        actual = re.sub(pat, sub, repr(factor))
+        self.assertEqual(actual, expected)
+
+        # no levels at all; nothing to normalise
+        factor = Categorical([], [])
+        expected = ("Categorical([], Levels (0): "
+                    "Index([], dtype=object)")
+        self.assertEqual(repr(factor), expected)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],