pandas-dev · ghost · Oct 15, 2012 · Oct 16, 2012 · Oct 18, 2012 · Oct 17, 2012
diff --git a/pandas/core/encoding.py b/pandas/core/encoding.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Mutable sequences handling? specificaly tuples.
+# generators - via wrapper?
+
+import unittest
+
+from pandas.util import py3compat
+from pandas.core.common import  PandasError
+import numpy as np
+import sys
+
+
+try:
+    next
+except NameError:  # pragma: no cover
+    # Python < 2.6
+    def next(x):
+        return x.next()
+
+# this should live in some package-wide conf object
+input_encoding='utf-8'
+perform_conversion=True
+guess_enc_on_decode_failure=True
+guess_enc_min_char_count=100
+guess_enc_max_iter=5000
+guess_enc_min_confidence=0.8
+
+def set_input_encoding(encoding):
+    global input_encoding
+    input_encoding=encoding
+
+def _should_process(obj):
+    """
+    A predicate function which determines whether obj should
+    be processed for byte-string conversion based on it's type.
+
+    Parameters
+    ----------
+    obj - any object
+
+    Returns
+    -------
+    bool - True if the object should be processed
+    """
+
+    # pd.Index* are isinstance(np.ndarray) but should be excluded
+    # because their constructors call decode directly.
+    #
+    return perform_conversion and \
+      ( isinstance(obj,(list,dict)) or \
+        type(obj) == np.ndarray or \
+        type(obj) == np.void )
+
+def _can_import(name):
+    """
+    Returns True if the named module/package can be imported"
+
+    Parameters
+    ----------
+    `name` -  package / module name.
+
+    Returns
+    -------
+    bool - True if `name` can be imported.
+
+    """
+    try:
+        __import__(name)
+        return True
+    except ImportError:
+        return False
+
+def _decode_obj(obj, encoding):
+    """
+    Recieves an object, and decodes any non-ascii byte-strings found
+    to unicode using the given encoding.
+
+    You should use `decode_catch_errors` to get friendly error messages
+    when decoding fails.
+
+    supports str,unicode, and mutable sequences
+    This function iterates over `seq`, decoding any byte-string object found using the
+    given `encoding`.
+
+    supports str/unicode and mutable sequences as input, all others
+    are returned as-is (including generators, for now)
+
+    Handles arbitrarily nested sequences.
+
+    Parameters
+    ----------
+    `obj` - any object.
+
+    Returns
+    -------
+    result -  seq with all non-ascii bytestring decoded into utf-8
+
+    Raises
+    ------
+    UnicodeDecodeError - if decoding with the given encoding fails
+    """
+
+    import types
+    def _dec_str(s,encoding=encoding):
+        try:
+            s.encode('ascii') # if it's ascii leave it alone
+        except UnicodeDecodeError:
+            s = s.decode(encoding) # if not, convert to unicode
+            # might raise another UnicodeDecodeError - handled by the caller
+        return s
+
+    def _dec_seq(seq):
+        if isinstance(seq, dict):
+           for k in seq.keys():  # grab the list of keys before we do mutation
+               v=seq[k]
+               if isinstance(k, str):
+                   k = _dec_str(k)
+               elif _should_process(k): # keys are immutable, need this?
+                   k = (yield _dec_seq(k))
+
+               if isinstance(v, str):
+                   v = _dec_str(v)
+               elif _should_process(v):
+                   v = (yield _dec_seq(v))
+
+               seq.pop(k) # need this
+               seq[k] = v
+
+        else:
+            for i,e in enumerate(seq):
+                if isinstance(e, str):
+                    seq[i] = _dec_str(e)
+                elif _should_process(e):
+                    (yield _dec_seq(e))
+
+        yield seq
+
+    if py3compat.PY3:
+        return obj
+
+    if isinstance(obj, basestring): # strings are simple
+        if isinstance(obj, str):
+            obj=_dec_str(obj)
+        return obj
+
+    if not _should_process(obj): # misc. objects are too
+        return obj
+
+    s = [_dec_seq(obj)]
+    values = []
+    while True: # others - not so much, let's see what we can do.
+        g = s.pop()
+        if values:
+            e = g.send(values.pop())
+        else:
+            e = next(g)
+        if type(e) == types.GeneratorType:
+            s.extend([g, e])
+        else:
+            if s:
+                values.append(e)
+            else:
+                return e
+
+def _extract_txt_from_obj(obj,max_iter=sys.maxint):
+    """
+    a generator which walks `obj`, yielding any byte-string found
+
+    will stop after at most `max_iter` iterations.
+
+    Parameters
+    ----------
+    `obj` - any iterable
+
+    Yields
+    -------
+    byte-strings.
+
+    Raises
+    ------
+    StopIteration - when there are no more byte-strings in the sequence
+
+    """
+
+    if obj is None or isinstance(obj,basestring):
+        if isinstance(obj,unicode):
+            return
+        yield obj
+        return
+
+    s = [iter(obj)]
+    cnt=0
+    while s:
+        g = s.pop()
+        for e in g:
+            cnt+=1
+            if isinstance(e, str):
+                yield e
+            elif isinstance(e, dict):
+                s.extend([g, e.iterkeys(), e.itervalues()])
+            elif _should_process(e):
+                s.append(g, iter(e))
+
+            if cnt >= max_iter:
+                return
+
+def _detect_encoding(obj,min_cnt=guess_enc_min_char_count,max_iter=guess_enc_max_iter):
+    """
+    extracts byte-string from obj via `_extract_txt_from_obj` and uses
+    the `chardet` package to detect the encoding used.
+
+    Can handle nested sequences, also looks at dict keys and values.
+
+    Parameters
+    ----------
+    `obj` - input string or sequence
+
+    `min_cnt` - specifies the minimum amount of characters which must be fed to to
+    the detector before we allow a decision.
+
+    `max_iter` - an upper bound on the number of elements examined in the sequence
+    looking for text.
+    This guards against the corner-case of a huge list with a decoding error only
+    near it's end.
+
+    Returns
+    -------
+    `result` - {'encoding': str, 'confidence': float}, or
+             {} if no encoding was found.
+    """
+    if not _can_import("chardet"):
+        return {}
+
+    from chardet.universaldetector import UniversalDetector
+    detector = UniversalDetector()
+    cnt = 0 # keep track of number of characters processed
+    for txt in _extract_txt_from_obj(obj,max_iter):
+        cnt += len(txt)
+        detector.feed(txt)
+        if (cnt > min_cnt and detector.done) :
+            break
+    detector.close()
+    res=detector.result
+    if res and res['confidence'] > guess_enc_min_confidence\
+      and cnt > min_cnt:
+        return detector.result
+    else:
+        return {}
+
+def decode_catch_errors(obj, encoding=None):
+    """
+    Delegates to `_decode_obj` in order to convert byte-string within obj into
+    unicode when necessary. If a decode error occurs, prints a user friendly
+    error message, and if the chardet library is available will try to give
+    the user a good guesss about the encoding used by extracting text from `obj`
+
+    Parameters
+    ----------
+    `obj` - anything
+    encoding - an acceptable encoding to be passed to str.decode()
+
+    Returns
+    -------
+    `result` - `obj` with byte-strings decoded into unicode strings
+
+    Raises
+    ------
+    `PandasError(msg)` - with msg being a friendly error message to the user
+    """
+
+    try:
+        encoding = encoding or input_encoding
+        return _decode_obj(obj, encoding)
+    except UnicodeDecodeError:
+        from textwrap import dedent
+        msg = \
+            """
+        The input Data contains strings that cannot be decoded with `%s`.
+        You should specify a correct encoding to the object constructor,
+        or set the value of the default input encoding in XXX.
+        """
+
+        s = dedent(msg) % encoding
+        if guess_enc_on_decode_failure:
+            if not _can_import("chardet"):
+                s += 'The "chardet" package is not installed - ' +\
+                     "can't suggest an encoding."
+            else:
+                det_enc=_detect_encoding(obj)
+                if det_enc:
+                    conf = det_enc['confidence']
+                    enc = det_enc['encoding']
+                    s += 'You might try "%s" as the encoding (Confidence: %2.1f)'\
+                  % (enc, conf)
+
+        raise PandasError(s)
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -237,23 +237,12 @@ def _to_str_columns(self, force_unicode=False):
 
         if not py3compat.PY3:
             if force_unicode:
-                def make_unicode(x):
-                    if isinstance(x, unicode):
-                        return x
-                    return x.decode('utf-8')
-                strcols = map(lambda col: map(make_unicode, col), strcols)
+                strcols = map(lambda col: map(unicode, col), strcols)
             else:
-                # generally everything is plain strings, which has ascii
-                # encoding.  problem is when there is a char with value over 127
-                # - everything then gets converted to unicode.
                 try:
                     map(lambda col: map(str, col), strcols)
                 except UnicodeError:
-                    def make_unicode(x):
-                        if isinstance(x, unicode):
-                            return x
-                        return x.decode('utf-8')
-                    strcols = map(lambda col: map(make_unicode, col), strcols)
+                    strcols = map(lambda col: map(unicode, col), strcols)
 
         return strcols
 
@@ -1121,6 +1110,8 @@ def reset(self):
 
 
 def _put_lines(buf, lines):
+    # handles #891 where ascii and unicode fields are mixed
+    # but will fail if encoded bytesting +unicode fields are mixed
     if any(isinstance(x, unicode) for x in lines):
         lines = [unicode(x) for x in lines]
     buf.write('\n'.join(lines))

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -37,6 +37,7 @@
 from pandas.util.decorators import deprecate, Appender, Substitution
 
 from pandas.tseries.period import PeriodIndex
+import pandas.core.encoding as en
 
 import pandas.core.algorithms as algos
 import pandas.core.datetools as datetools
@@ -366,6 +367,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
         if data is None:
             data = {}
 
+        columns= en.decode_catch_errors(columns)
+        index= en.decode_catch_errors(index)
+        data= en.decode_catch_errors(data)
+
         if isinstance(data, DataFrame):
             data = data._data
 
@@ -950,7 +955,7 @@ def from_items(cls, items, columns=None, orient='columns'):
 
     @classmethod
     def from_csv(cls, path, header=0, sep=',', index_col=0,
-                 parse_dates=True, encoding=None):
+                 parse_dates=True, encoding='utf-8'):
         """
         Read delimited file into DataFrame
 
@@ -1675,6 +1680,8 @@ def iget_value(self, i, j):
 
     def __getitem__(self, key):
         # slice rows
+        key=en.decode_catch_errors(key)
+
         if isinstance(key, slice):
             from pandas.core.indexing import _is_index_slice
             idx_type = self.index.inferred_type
@@ -1793,6 +1800,9 @@ def __setattr__(self, name, value):
     def __setitem__(self, key, value):
         # support boolean setting with DataFrame input, e.g.
         # df[df > df2] = 0
+        value=en.decode_catch_errors(value)
+        key=en.decode_catch_errors(key)
+
         if isinstance(key, DataFrame):
             if not (key.index.equals(self.index) and
                     key.columns.equals(self.columns)):
@@ -1972,6 +1982,9 @@ def xs(self, key, axis=0, level=None, copy=True):
         -------
         xs : Series or DataFrame
         """
+
+        key=en.decode_catch_errors(key)
+
         labels = self._get_axis(axis)
         if level is not None:
             loc, new_ax = labels.get_loc_level(key, level=level)