fixup for CategoricalIndex merge

jreback · jreback · commit c46bd618c695 · 2015-05-08T14:26:04.000-04:00
increase limits for max_seq_items & printing for Index

add extended repr for datetimelike indexes

fix tseries/test_base for repr

adjust docs for repr-name

use new format_data on all Index types
diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
@@ -675,10 +675,7 @@ values NOT in the categories, similarly to how you can reindex ANY pandas index.
                                }).set_index('B')
 
       In [11]: df3.index
-      Out[11]:
-      CategoricalIndex([u'a', u'a', u'b', u'b', u'c', u'a'],
-                       categories=[u'a', u'b', u'c'],
-                       ordered=False)
+      Out[11]: CategoricalIndex([u'a', u'a', u'b', u'b', u'c', u'a'], categories=[u'a', u'b', u'c'], ordered=False, name=u'B', dtype='category')
 
       In [12]: pd.concat([df2,df3]
       TypeError: categories must match existing categories when appending
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -254,15 +254,13 @@ API changes
 Index Representation
 ~~~~~~~~~~~~~~~~~~~~
 
-The string representation of ``Index`` and its sub-classes have now been unified. These are all uniform in their output
-formats, except for ``MultiIndex``, which has a multi-line repr. The display width responds to the option ``display.max_seq_len``,
-which is now defaulted to 10 (previously was 100). (:issue:`6482`)
+The string representation of ``Index`` and its sub-classes has now been unified. ``Index, Int64Index, Float64Index, CategoricalIndex`` use a single-line display. The datetimelikes ``DatetimeIndex, PeriodIndex, TimedeltaIndex`` and ``MultiIndex`` will display in a multi-line format showing much more of the index values. The display width responds to the option ``display.max_seq_items``,
+which is now defaulted to 20 (previously was 100). (:issue:`6482`)
 
 Previous Behavior
 
 .. code-block:: python
 
-
    In [1]: pd.get_option('max_seq_items')
    Out[1]: 100
 
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -3132,7 +3132,7 @@ def in_ipython_frontend():
 #    working with straight ascii.
 
 
-def _pprint_seq(seq, _nest_lvl=0, **kwds):
+def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):
     """
     internal. pprinter for iterables. you should probably use pprint_thing()
     rather then calling this directly.
@@ -3144,12 +3144,15 @@ def _pprint_seq(seq, _nest_lvl=0, **kwds):
     else:
         fmt = u("[%s]") if hasattr(seq, '__setitem__') else u("(%s)")
 
-    nitems = get_option("max_seq_items") or len(seq)
+    if max_seq_items is False:
+        nitems = len(seq)
+    else:
+        nitems = max_seq_items or get_option("max_seq_items") or len(seq)
 
     s = iter(seq)
     r = []
     for i in range(min(nitems, len(seq))):  # handle sets, no slicing
-        r.append(pprint_thing(next(s), _nest_lvl + 1, **kwds))
+        r.append(pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds))
     body = ", ".join(r)
 
     if nitems < len(seq):
@@ -3160,7 +3163,7 @@ def _pprint_seq(seq, _nest_lvl=0, **kwds):
     return fmt % body
 
 
-def _pprint_dict(seq, _nest_lvl=0, **kwds):
+def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds):
     """
     internal. pprinter for iterables. you should probably use pprint_thing()
     rather then calling this directly.
@@ -3170,11 +3173,14 @@ def _pprint_dict(seq, _nest_lvl=0, **kwds):
 
     pfmt = u("%s: %s")
 
-    nitems = get_option("max_seq_items") or len(seq)
+    if max_seq_items is False:
+        nitems = len(seq)
+    else:
+        nitems = max_seq_items or get_option("max_seq_items") or len(seq)
 
     for k, v in list(seq.items())[:nitems]:
-        pairs.append(pfmt % (pprint_thing(k, _nest_lvl + 1, **kwds),
-                             pprint_thing(v, _nest_lvl + 1, **kwds)))
+        pairs.append(pfmt % (pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
+                             pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)))
 
     if nitems < len(seq):
         return fmt % (", ".join(pairs) + ", ...")
@@ -3183,7 +3189,7 @@ def _pprint_dict(seq, _nest_lvl=0, **kwds):
 
 
 def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False,
-                 quote_strings=False):
+                 quote_strings=False, max_seq_items=None):
     """
     This function is the sanctioned way of converting objects
     to a unicode representation.
@@ -3202,6 +3208,8 @@ def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False,
         replacements
     default_escapes : bool, default False
         Whether the input escape characters replaces or adds to the defaults
+    max_seq_items : False, int, default None
+        Pass thru to other pretty printers to limit sequence printing
 
     Returns
     -------
@@ -3240,11 +3248,11 @@ def as_escaped_unicode(thing, escape_chars=escape_chars):
         return compat.text_type(thing)
     elif (isinstance(thing, dict) and
           _nest_lvl < get_option("display.pprint_nest_depth")):
-        result = _pprint_dict(thing, _nest_lvl, quote_strings=True)
+        result = _pprint_dict(thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items)
     elif is_sequence(thing) and _nest_lvl < \
             get_option("display.pprint_nest_depth"):
         result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars,
-                             quote_strings=quote_strings)
+                             quote_strings=quote_strings, max_seq_items=max_seq_items)
     elif isinstance(thing, compat.string_types) and quote_strings:
         if compat.PY3:
             fmt = "'%s'"
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -269,7 +269,7 @@ def mpl_style_cb(key):
     cf.register_option('show_dimensions', 'truncate', pc_show_dimensions_doc,
                        validator=is_one_of_factory([True, False, 'truncate']))
     cf.register_option('chop_threshold', None, pc_chop_threshold_doc)
-    cf.register_option('max_seq_items', 10, pc_max_seq_items)
+    cf.register_option('max_seq_items', 20, pc_max_seq_items)
     cf.register_option('mpl_style', None, pc_mpl_style_doc,
                        validator=is_one_of_factory([None, False, 'default']),
                        cb=mpl_style_cb)
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -26,8 +26,10 @@
 from pandas.io.common import PerformanceWarning
 
 # simplify
-default_pprint = lambda x: com.pprint_thing(x, escape_chars=('\t', '\r', '\n'),
-                                            quote_strings=True)
+default_pprint = lambda x, max_seq_items=None: com.pprint_thing(x,
+                                                                escape_chars=('\t', '\r', '\n'),
+                                                                quote_strings=True,
+                                                                max_seq_items=max_seq_items)
 
 
 __all__ = ['Index']
@@ -430,6 +432,37 @@ def _formatter_func(self):
         return default_pprint
 
     def _format_data(self):
+        """
+        Return the formatted data as a unicode string
+        """
+        space1 = "\n%s" % (' ' * (len(self.__class__.__name__) + 2))
+        space2 = "\n%s" % (' ' * (len(self.__class__.__name__) + 1))
+        sep = ',%s' % space1
+        max_seq_items = get_option('display.max_seq_items')
+        formatter = self._formatter_func
+        n = len(self)
+        if n == 0:
+            summary = '[]'
+        elif n == 1:
+            first = formatter(self[0])
+            summary = '[%s]' % first
+        elif n == 2:
+            first = formatter(self[0])
+            last = formatter(self[-1])
+            summary = '[%s%s%s]' % (first, sep, last)
+        elif n > max_seq_items:
+            n = min(max_seq_items//2,10)
+
+            head = sep.join([ formatter(x) for x in self[:n] ])
+            tail = sep.join([ formatter(x) for x in self[-n:] ])
+            summary = '[%s%s...%s%s]' % (head, space1, space1, tail)
+        else:
+            values = sep.join([ formatter(x) for x in self ])
+            summary = '[%s]' % (values)
+
+        return summary
+
+    def _format_data2(self):
         """
         Return the formatted data as a unicode string
         """
@@ -446,7 +479,7 @@ def _format_data(self):
             last = formatter(self[-1])
             summary = '[%s, %s]' % (first, last)
         elif n > max_seq_items:
-            n = min(max_seq_items//2,2)
+            n = min(max_seq_items//2,5)
             head = ', '.join([ formatter(x) for x in self[:n] ])
             tail = ', '.join([ formatter(x) for x in self[-n:] ])
             summary = '[%s, ..., %s]' % (head, tail)
@@ -2874,32 +2907,19 @@ def equals(self, other):
 
         return False
 
-    def __unicode__(self):
+    def _format_attrs(self):
         """
-        Return a string representation for this object.
-
-        Invoked by unicode(df) in py2 only. Yields a Unicode String in both
-        py2/py3.
+        Return a list of tuples of the (attr,formatted_value)
         """
-
-        # currently doesn't use the display.max_categories, or display.max_seq_len
-        # for head/tail printing
-        values = default_pprint(self.values.get_values())
-        cats = default_pprint(self.categories.get_values())
-        space = ' ' * (len(self.__class__.__name__) + 1)
-        name = self.name
-        if name is not None:
-            name = default_pprint(name)
-
-        result = u("{klass}({values},\n{space}categories={categories},\n{space}ordered={ordered},\n{space}name={name})").format(
-            klass=self.__class__.__name__,
-            values=values,
-            categories=cats,
-            ordered=self.ordered,
-            name=name,
-            space=space)
-
-        return result
+        attrs = [('categories', default_pprint(self.categories)),
+                 ('ordered',self.ordered)]
+        if self.name is not None:
+            attrs.append(('name',default_pprint(self.name)))
+        attrs.append(('dtype',"'%s'" % self.dtype))
+        max_seq_items = get_option('display.max_seq_items')
+        if len(self) > max_seq_items:
+            attrs.append(('length',len(self)))
+        return attrs
 
     @property
     def inferred_type(self):
@@ -3955,8 +3975,8 @@ def _format_attrs(self):
         """
         Return a list of tuples of the (attr,formatted_value)
         """
-        attrs = [('levels', default_pprint(self.levels)),
-                 ('labels', default_pprint(self.labels))]
+        attrs = [('levels', default_pprint(self._levels, max_seq_items=False)),
+                 ('labels', default_pprint(self._labels, max_seq_items=False))]
         if not all(name is None for name in self.names):
             attrs.append(('names', default_pprint(self.names)))
         if self.sortorder is not None:
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -1711,7 +1711,7 @@ def test_get_indexer(self):
         self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='backfill'))
         self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='nearest'))
 
-    def test_repr(self):
+    def test_repr_roundtrip(self):
 
         ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True)
         str(ci)
@@ -1724,9 +1724,12 @@ def test_repr(self):
             compat.text_type(ci)
 
         # long format
+        # this is not reprable
         ci = CategoricalIndex(np.random.randint(0,5,size=100))
-        result = str(ci)
-        tm.assert_index_equal(eval(repr(ci)),ci,exact=True)
+        if compat.PY3:
+            str(ci)
+        else:
+            compat.text_type(ci)
 
     def test_isin(self):
 
@@ -4417,6 +4420,23 @@ def test_repr_with_unicode_data(self):
             index = pd.DataFrame(d).set_index(["a", "b"]).index
             self.assertFalse("\\u" in repr(index))  # we don't want unicode-escaped
 
+    def test_repr_roundtrip(self):
+
+        mi = MultiIndex.from_product([list('ab'),range(3)],names=['first','second'])
+        str(mi)
+        tm.assert_index_equal(eval(repr(mi)),mi,exact=True)
+
+        # formatting
+        if compat.PY3:
+            str(mi)
+        else:
+            compat.text_type(mi)
+
+        # long format
+        mi = MultiIndex.from_product([list('abcdefg'),range(10)],names=['first','second'])
+        result = str(mi)
+        tm.assert_index_equal(eval(repr(mi)),mi,exact=True)
+
     def test_str(self):
         # tested elsewhere
         pass
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
@@ -17,6 +17,7 @@
     infer_freq, to_offset, get_period_alias,
     Resolution)
 import pandas.algos as _algos
+from pandas.core.config import get_option
 
 class DatetimeIndexOpsMixin(object):
     """ common ops mixin to support a unified inteface datetimelike Index """
@@ -79,9 +80,9 @@ def freqstr(self):
 
     @cache_readonly
     def inferred_freq(self):
-        """ 
-        Trys to return a string representing a frequency guess, 
-        generated by infer_freq.  Returns None if it can't autodetect the 
+        """
+        Trys to return a string representing a frequency guess,
+        generated by infer_freq.  Returns None if it can't autodetect the
         frequency.
         """
         try:
diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py
@@ -123,13 +123,20 @@ def test_representation(self):
 
         exp2 = """DatetimeIndex(['2011-01-01'], dtype='datetime64[ns]', freq='D', tz=None)"""
 
-        exp3 = """DatetimeIndex(['2011-01-01', '2011-01-02'], dtype='datetime64[ns]', freq='D', tz=None)"""
+        exp3 = """DatetimeIndex(['2011-01-01'
+               '2011-01-02'], dtype='datetime64[ns]', freq='D', tz=None)"""
 
-        exp4 = """DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='datetime64[ns]', freq='D', tz=None)"""
+        exp4 = """DatetimeIndex(['2011-01-01',
+               '2011-01-02',
+               '2011-01-03'], dtype='datetime64[ns]', freq='D', tz=None)"""
 
-        exp5 = """DatetimeIndex(['2011-01-01 09:00:00+09:00', '2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00'], dtype='datetime64[ns]', freq='H', tz='Asia/Tokyo')"""
+        exp5 = """DatetimeIndex(['2011-01-01 09:00:00+09:00',
+               '2011-01-01 10:00:00+09:00',
+               '2011-01-01 11:00:00+09:00'], dtype='datetime64[ns]', freq='H', tz='Asia/Tokyo')"""
 
-        exp6 = """DatetimeIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', 'NaT'], dtype='datetime64[ns]', freq=None, tz='US/Eastern')"""
+        exp6 = """DatetimeIndex(['2011-01-01 09:00:00-05:00',
+               '2011-01-01 10:00:00-05:00',
+               'NaT'], dtype='datetime64[ns]', freq=None, tz='US/Eastern')"""
 
         for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6],
                                  [exp1, exp2, exp3, exp4, exp5, exp6]):
@@ -370,11 +377,16 @@ def test_representation(self):
 
         exp2 = """TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', freq='D')"""
 
-        exp3 = """TimedeltaIndex(['1 days', '2 days'], dtype='timedelta64[ns]', freq='D')"""
+        exp3 = """TimedeltaIndex(['1 days'
+                '2 days'], dtype='timedelta64[ns]', freq='D')"""
 
-        exp4 = """TimedeltaIndex(['1 days', '2 days', '3 days'], dtype='timedelta64[ns]', freq='D')"""
+        exp4 = """TimedeltaIndex(['1 days',
+                '2 days',
+                '3 days'], dtype='timedelta64[ns]', freq='D')"""
 
-        exp5 = """TimedeltaIndex(['1 days 00:00:01', '2 days 00:00:00', '3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)"""
+        exp5 = """TimedeltaIndex(['1 days 00:00:01',
+                '2 days 00:00:00',
+                '3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)"""
 
         for idx, expected in zip([idx1, idx2, idx3, idx4, idx5],
                                  [exp1, exp2, exp3, exp4, exp5]):
@@ -834,19 +846,29 @@ def test_representation(self):
 
         exp2 = """PeriodIndex(['2011-01-01'], dtype='int64', freq='D')"""
 
-        exp3 = """PeriodIndex(['2011-01-01', '2011-01-02'], dtype='int64', freq='D')"""
+        exp3 = """PeriodIndex(['2011-01-01'
+             '2011-01-02'], dtype='int64', freq='D')"""
 
-        exp4 = """PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='int64', freq='D')"""
+        exp4 = """PeriodIndex(['2011-01-01',
+             '2011-01-02',
+             '2011-01-03'], dtype='int64', freq='D')"""
 
-        exp5 = """PeriodIndex(['2011', '2012', '2013'], dtype='int64', freq='A-DEC')"""
+        exp5 = """PeriodIndex(['2011',
+             '2012',
+             '2013'], dtype='int64', freq='A-DEC')"""
 
-        exp6 = """PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], dtype='int64', freq='H')"""
+        exp6 = """PeriodIndex(['2011-01-01 09:00',
+             '2012-02-01 10:00',
+             'NaT'], dtype='int64', freq='H')"""
 
         exp7 = """PeriodIndex(['2013Q1'], dtype='int64', freq='Q-DEC')"""
 
-        exp8 = """PeriodIndex(['2013Q1', '2013Q2'], dtype='int64', freq='Q-DEC')"""
+        exp8 = """PeriodIndex(['2013Q1'
+             '2013Q2'], dtype='int64', freq='Q-DEC')"""
 
-        exp9 = """PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], dtype='int64', freq='Q-DEC')"""
+        exp9 = """PeriodIndex(['2013Q1',
+             '2013Q2',
+             '2013Q3'], dtype='int64', freq='Q-DEC')"""
 
         for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9],
                                  [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9]):