pandas-dev · wesm · Oct 12, 2012 · Oct 1, 2012 · Oct 10, 2012 · Oct 1, 2012
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -721,23 +721,6 @@ def _index_labels_to_array(labels):
 
     return labels
 
-def _stringify(col, encoding='UTF8'):
-    # unicode workaround
-    try:
-        return unicode(col)
-    except UnicodeError:
-        try:
-            if isinstance(col, str):
-                return col.decode(encoding)
-        except UnicodeError:
-            pass
-        return console_encode(col)
-
-def _stringify_seq(values):
-    if any(isinstance(x, unicode) for x in values):
-        return [_stringify(x) for x in values]
-    return [str(x) for x in values]
-
 def _maybe_make_list(obj):
     if obj is not None and not isinstance(obj, (tuple, list)):
         return [obj]
@@ -753,6 +736,9 @@ def is_iterator(obj):
     # python 3 generators have __next__ instead of next
     return hasattr(obj, 'next') or hasattr(obj, '__next__')
 
+def is_number(obj):
+    return isinstance(obj, (np.number, int, long, float))  # NOTE: bool is an int subclass, so it passes
+
 def is_integer_dtype(arr_or_dtype):
     if isinstance(arr_or_dtype, np.dtype):
         tipo = arr_or_dtype.type
@@ -778,6 +764,14 @@ def is_float_dtype(arr_or_dtype):
 def is_list_like(arg):
     return hasattr(arg, '__iter__') and not isinstance(arg, basestring)
 
+def _is_sequence(x):
+    """Return True if x is iterable and is not a string (basestring)."""
+    try:
+        iter(x)
+    except Exception:
+        return False
+    return not isinstance(x, basestring)  # explicit: assert is stripped by -O
+
 _ensure_float64 = _algos.ensure_float64
 _ensure_int64 = _algos.ensure_int64
 _ensure_int32 = _algos.ensure_int32
@@ -859,16 +853,6 @@ def load(path):
     finally:
         f.close()
 
-def console_encode(value):
-    if py3compat.PY3 or not isinstance(value, unicode):
-        return value
-
-    try:
-        import sys
-        return value.encode(sys.stdin.encoding or 'utf-8', 'replace')
-    except (AttributeError, TypeError):
-        return value.encode('ascii', 'replace')
-
 class UTF8Recoder:
     """
     Iterator that reads an encoded stream and reencodes the input to UTF-8
@@ -932,10 +916,17 @@ def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
             self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
             self.stream = f
             self.encoder = codecs.getincrementalencoder(encoding)()
+            self.quoting=kwds.get("quoting",None)
 
         def writerow(self, row):
-            row = [x if isinstance(x, basestring) else str(x) for x in row]
-            self.writer.writerow([s.encode("utf-8") for s in row])
+            def _check_as_is(x):
+                return (self.quoting == csv.QUOTE_NONNUMERIC and \
+                        is_number(x)) or isinstance(x, str)
+
+            row = [x if _check_as_is(x)
+                   else pprint_thing(x).encode('utf-8') for x in row]
+
+            self.writer.writerow([s for s in row])
             # Fetch UTF-8 output from the queue ...
             data = self.queue.getvalue()
             data = data.decode("utf-8")
@@ -960,3 +951,94 @@ def _concat_compat(to_concat, axis=0):
         return new_values.view(_NS_DTYPE)
     else:
         return np.concatenate(to_concat, axis=axis)
+
+# Unicode consolidation
+# ---------------------
+#
+# pprinting utility functions for generating Unicode text or bytes(3.x)/str(2.x)
+# representations of objects.
+# Try to use these as much as possible rather than rolling your own.
+#
+# When to use
+# -----------
+#
+# 1) If you're writing code internal to pandas (no I/O directly involved),
+#    use pprint_thing().
+#
+#    It will always return unicode text which can be handled by other
+#    parts of the package without breakage.
+#
+# 2) If you need to send something to the console, use console_encode().
+#
+#    console_encode() should (hopefully) choose the right encoding for you
+#    based on the encoding set in fmt.print_config.encoding.
+#
+# 3) if you need to write something out to file, use pprint_thing_encoded(encoding).
+#
+#    If no encoding is specified, it defaults to utf-8. Since encoding pure ascii with
+#    utf-8 is a no-op you can safely use the default utf-8 if you're working with
+#    straight ascii.
+
+def _pprint_seq(seq,_nest_lvl=0):
+    """
+    internal. pprinter for iterables. you should probably use pprint_thing()
+    rather than calling this directly.
+    """
+    fmt=u"[%s]" if hasattr(seq,'__setitem__') else u"(%s)"
+    return fmt % ", ".join(pprint_thing(e,_nest_lvl+1) for e in seq)
+
+def pprint_thing(thing,_nest_lvl=0):
+    """
+    This function is the sanctioned way of converting objects
+    to a unicode representation.
+
+    properly handles nested sequences containing unicode strings
+    (unicode(object) does not)
+
+    Parameters
+    ----------
+    thing : anything to be formatted
+    _nest_lvl : internal use only. pprint_thing() is mutually-recursive
+       with _pprint_seq(), this argument is used to keep track of the
+       current nesting level, and limit it (print_config.pprint_nest_depth).
+
+    Returns
+    -------
+    result - unicode object on py2, str on py3. Always Unicode.
+    None is formatted as the empty string.
+    """
+    from pandas.core.format import print_config
+    if thing is None:
+        result = ''
+    elif _is_sequence(thing) and _nest_lvl < print_config.pprint_nest_depth:
+        result = _pprint_seq(thing,_nest_lvl)
+    else:
+        # when used internally in the package, everything
+        # passed in should be a unicode object or have a unicode
+        # __str__. However as an aid to transition, we also accept
+        # utf8 encoded strings, if that's not it, we have no way
+        # to know, and the user should deal with it himself.
+        # so we resort to utf-8 with replacing errors
+
+        try:
+            result = unicode(thing) # we should try this first
+        except UnicodeDecodeError:
+            # either utf-8 or we replace errors
+            result = str(thing).decode('utf-8',"replace")
+
+    return unicode(result) # always unicode
+
+def pprint_thing_encoded(object,encoding='utf-8',errors='replace'):
+    # unicode text from pprint_thing(), encoded with the requested codec
+    return pprint_thing(object).encode(encoding, errors)
+
+def console_encode(object):
+    """
+    this is the sanctioned way to prepare something for
+    sending *to the console*, it delegates to pprint_thing() to get
+    a unicode representation of the object and relies on the global
+    encoding set in print_config.encoding. Use this everywhere
+    where you output to the console.
+    """
+    from pandas.core.format import print_config
+    return pprint_thing_encoded(object,print_config.encoding)
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -8,7 +8,7 @@
 except:
     from io import StringIO
 
-from pandas.core.common import adjoin, isnull, notnull, _stringify
+from pandas.core.common import adjoin, isnull, notnull
 from pandas.core.index import MultiIndex, _ensure_index
 from pandas.util import py3compat
 
@@ -72,7 +72,7 @@ def __init__(self, series, buf=None, header=True, length=True,
         self.float_format = float_format
 
     def _get_footer(self):
-        footer = ''
+        footer = u''
 
         if self.name:
             if getattr(self.series.index, 'freq', None):
@@ -81,24 +81,15 @@ def _get_footer(self):
             if footer and self.series.name:
                 footer += ', '
 
-            if self.series.name:
-                if isinstance(self.series.name, basestring):
-                    series_name = self.series.name
-                elif isinstance(self.series.name, tuple):
-                    series_name = "('%s')" % "', '".join(self.series.name)
-                else:
-                    series_name = str(self.series.name)
-            else:
-                series_name = self.series.name
-
-            footer += (("Name: %s" % series_name)
-                       if series_name is not None else '')
+            series_name = com.pprint_thing(self.series.name)
+            footer += ("Name: %s" % series_name) if self.series.name is not None else ""
 
         if self.length:
             if footer:
                 footer += ', '
             footer += 'Length: %d' % len(self.series)
-        return footer
+
+        return unicode(footer)
 
     def _get_formatted_index(self):
         index = self.series.index
@@ -143,7 +134,9 @@ def to_string(self):
         if footer:
             result.append(footer)
 
-        return '\n'.join(result)
+        if py3compat.PY3:
+            return unicode(u'\n'.join(result))
+        return com.console_encode(u'\n'.join(result))
 
 if py3compat.PY3:  # pragma: no cover
     _encode_diff = lambda x: 0
@@ -724,12 +717,7 @@ def _format_strings(self, use_unicode=False):
         else:
             float_format = self.float_format
 
-        if use_unicode:
-            def _strify(x):
-                return _stringify(x, print_config.encoding)
-            formatter = _strify if self.formatter is None else self.formatter
-        else:
-            formatter = str if self.formatter is None else self.formatter
+        formatter = com.pprint_thing if self.formatter is None else self.formatter
 
         def _format(x):
             if self.na_rep is not None and lib.checknull(x):
@@ -1098,10 +1086,33 @@ def __init__(self):
         self.notebook_repr_html = True
         self.date_dayfirst = False
         self.date_yearfirst = False
+        self.pprint_nest_depth = 3
         self.multi_sparse = True
-        self.encoding = sys.getdefaultencoding()
-        if self.encoding == 'ascii':
-            self.encoding = 'UTF8'
+        self.encoding = self.detect_encoding()
+
+    def detect_encoding(self):
+        """
+        Try to find the most capable encoding supported by the console.
+        slightly modified from the way IPython handles the same issue.
+        """
+        import locale
+
+        encoding = None
+        try:
+            encoding=sys.stdin.encoding
+        except AttributeError:
+            pass
+
+        if not encoding or encoding =='ascii': # try again for something better
+            try:
+                encoding = locale.getpreferredencoding()
+            except Exception:
+                pass
+
+        if not encoding: # when all else fails. this will usually be "ascii"
+                encoding = sys.getdefaultencoding()
+
+        return encoding
 
     def reset(self):
         self.__init__()

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -24,8 +24,8 @@
 import numpy as np
 import numpy.ma as ma
 
-from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
-                                _default_index, _stringify)
+from pandas.core.common import (isnull, notnull, PandasError, _try_sort,\
+                                _default_index,_is_sequence)
 from pandas.core.generic import NDFrame
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels
@@ -584,6 +584,9 @@ def __repr__(self):
         else:
             self.to_string(buf=buf)
         value = buf.getvalue()
+
+        if py3compat.PY3:
+            return unicode(value)
         return com.console_encode(value)
 
     def _repr_html_(self):
@@ -1365,7 +1368,7 @@ def info(self, verbose=True, buf=None):
         # hack
         if verbose and len(self.columns) < 100:
             lines.append('Data columns:')
-            space = max([len(_stringify(k)) for k in self.columns]) + 4
+            space = max([len(com.pprint_thing(k)) for k in self.columns]) + 4
             counts = self.count()
             assert(len(cols) == len(counts))
             for col, count in counts.iteritems():
@@ -5106,14 +5109,6 @@ def _homogenize(data, index, columns, dtype=None):
 def _put_str(s, space):
     return ('%s' % s)[:space].ljust(space)
 
-def _is_sequence(x):
-    try:
-        iter(x)
-        assert(not isinstance(x, basestring))
-        return True
-    except Exception:
-        return False
-
 def install_ipython_completers():  # pragma: no cover
     """Register the DataFrame type with IPython's tab completion machinery, so
     that it knows about accessing column names as attributes."""