diff --git a/pandas/core/encoding.py b/pandas/core/encoding.py new file mode 100644 index 0000000000000..50a732d3d081d --- /dev/null +++ b/pandas/core/encoding.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Mutable sequences handling? specificaly tuples. +# generators - via wrapper? + +import unittest + +from pandas.util import py3compat +from pandas.core.common import PandasError +import numpy as np +import sys + + +try: + next +except NameError: # pragma: no cover + # Python < 2.6 + def next(x): + return x.next() + +# this should live in some package-wide conf object +input_encoding='utf-8' +perform_conversion=True +guess_enc_on_decode_failure=True +guess_enc_min_char_count=100 +guess_enc_max_iter=5000 +guess_enc_min_confidence=0.8 + +def set_input_encoding(encoding): + global input_encoding + input_encoding=encoding + +def _should_process(obj): + """ + A predicate function which determines whether obj should + be processed for byte-string conversion based on it's type. + + Parameters + ---------- + obj - any object + + Returns + ------- + bool - True if the object should be processed + """ + + # pd.Index* are isinstance(np.ndarray) but should be excluded + # because their constructors call decode directly. + # + return perform_conversion and \ + ( isinstance(obj,(list,dict)) or \ + type(obj) == np.ndarray or \ + type(obj) == np.void ) + +def _can_import(name): + """ + Returns True if the named module/package can be imported" + + Parameters + ---------- + `name` - package / module name. + + Returns + ------- + bool - True if `name` can be imported. + + """ + try: + __import__(name) + return True + except ImportError: + return False + +def _decode_obj(obj, encoding): + """ + Recieves an object, and decodes any non-ascii byte-strings found + to unicode using the given encoding. + + You should use `decode_catch_errors` to get friendly error messages + when decoding fails. 
+
+    supports str,unicode, and mutable sequences
+    This function iterates over `obj`, decoding any byte-string object found using the
+    given `encoding`.
+
+    supports str/unicode and mutable sequences as input, all others
+    are returned as-is (including generators, for now)
+
+    Handles arbitrarily nested sequences.
+
+    Parameters
+    ----------
+    `obj` - any object.
+    `encoding` - codec name handed to str.decode() for non-ascii byte-strings.
+
+    Returns
+    -------
+    result - `obj` with all non-ascii byte-strings decoded into unicode
+             (lists, dicts and arrays are modified in place)
+
+    Raises
+    ------
+    UnicodeDecodeError - if decoding with the given encoding fails
+    """
+
+    import types
+    def _dec_str(s,encoding=encoding):
+        try:
+            s.encode('ascii') # if it's ascii leave it alone
+        except UnicodeDecodeError:
+            s = s.decode(encoding) # if not, convert to unicode
+            # might raise another UnicodeDecodeError - handled by the caller
+        return s
+
+    # _dec_seq is a generator used as a coroutine: for every nested container
+    # it yields a fresh _dec_seq generator and expects the driver loop below
+    # to send() the processed container back; its last yield is `seq` itself.
+    # This keeps the traversal on an explicit stack instead of the call stack.
+    def _dec_seq(seq):
+        if isinstance(seq, dict):
+            for orig_k in seq.keys(): # grab the list of keys before we do mutation
+                v=seq[orig_k]
+                k = orig_k
+                if isinstance(k, str):
+                    k = _dec_str(k)
+                elif _should_process(k): # keys are immutable, need this?
+                    k = (yield _dec_seq(k))
+
+                if isinstance(v, str):
+                    v = _dec_str(v)
+                elif _should_process(v):
+                    v = (yield _dec_seq(v))
+
+                # remove the entry under the key it was stored with; popping
+                # the decoded key would raise KeyError whenever decoding
+                # changed it (non-ascii byte-string keys)
+                seq.pop(orig_k)
+                seq[k] = v
+
+        else:
+            for i,e in enumerate(seq):
+                if isinstance(e, str):
+                    seq[i] = _dec_str(e)
+                elif _should_process(e):
+                    # nested containers are mutated in place, so the value
+                    # sent back by the driver is not needed here
+                    (yield _dec_seq(e))
+
+        yield seq
+
+    if py3compat.PY3:
+        return obj
+
+    if isinstance(obj, basestring): # strings are simple
+        if isinstance(obj, str):
+            obj=_dec_str(obj)
+        return obj
+
+    if not _should_process(obj): # misc. objects are too
+        return obj
+
+    s = [_dec_seq(obj)]
+    values = []
+    while True: # others - not so much, let's see what we can do.
+        g = s.pop()
+        if values:
+            e = g.send(values.pop())
+        else:
+            e = next(g)
+        if type(e) == types.GeneratorType:
+            s.extend([g, e])
+        else:
+            if s:
+                values.append(e)
+            else:
+                return e
+
+def _extract_txt_from_obj(obj,max_iter=sys.maxint):
+    """
+    a generator which walks `obj`, yielding any byte-string found
+
+    will stop after at most `max_iter` elements have been examined.
+
+    Parameters
+    ----------
+    `obj` - a byte-string, a dict, or any iterable.
+            `None` and unicode strings yield nothing.
+
+    `max_iter` - an upper bound on the number of elements examined.
+
+    Yields
+    -------
+    byte-strings.
+
+    """
+
+    if obj is None or isinstance(obj,unicode):
+        return # nothing here that could need decoding
+
+    if isinstance(obj,basestring):
+        yield obj
+        return
+
+    if isinstance(obj, dict):
+        # iter(dict) only walks the keys; examine the values as well,
+        # the same way nested dicts are handled in the loop below
+        s = [obj.iterkeys(), obj.itervalues()]
+    else:
+        s = [iter(obj)]
+    cnt=0
+    while s:
+        g = s.pop()
+        for e in g:
+            cnt+=1
+            if isinstance(e, str):
+                yield e
+            elif isinstance(e, dict):
+                s.extend([g, e.iterkeys(), e.itervalues()])
+            elif _should_process(e):
+                # list.append() takes exactly one argument; extend() pushes
+                # both iterators onto the stack
+                s.extend([g, iter(e)])
+
+            if cnt >= max_iter:
+                return
+
+def _detect_encoding(obj,min_cnt=guess_enc_min_char_count,max_iter=guess_enc_max_iter):
+    """
+    extracts byte-string from obj via `_extract_txt_from_obj` and uses
+    the `chardet` package to detect the encoding used.
+
+    Can handle nested sequences, also looks at dict keys and values.
+
+    Parameters
+    ----------
+    `obj` - input string or sequence
+
+    `min_cnt` - specifies the minimum amount of characters which must be fed to to
+    the detector before we allow a decision.
+
+    `max_iter` - an upper bound on the number of elements examined in the sequence
+    looking for text.
+    This guards against the corner-case of a huge list with a decoding error only
+    near it's end.
+
+    Returns
+    -------
+    `result` - {'encoding': str, 'confidence': float}, or
+    {} if no encoding was found.
+ """ + if not _can_import("chardet"): + return {} + + from chardet.universaldetector import UniversalDetector + detector = UniversalDetector() + cnt = 0 # keep track of number of characters processed + for txt in _extract_txt_from_obj(obj,max_iter): + cnt += len(txt) + detector.feed(txt) + if (cnt > min_cnt and detector.done) : + break + detector.close() + res=detector.result + if res and res['confidence'] > guess_enc_min_confidence\ + and cnt > min_cnt: + return detector.result + else: + return {} + +def decode_catch_errors(obj, encoding=None): + """ + Delegates to `_decode_obj` in order to convert byte-string within obj into + unicode when necessary. If a decode error occurs, prints a user friendly + error message, and if the chardet library is available will try to give + the user a good guesss about the encoding used by extracting text from `obj` + + Parameters + ---------- + `obj` - anything + encoding - an acceptable encoding to be passed to str.decode() + + Returns + ------- + `result` - `obj` with byte-strings decoded into unicode strings + + Raises + ------ + `PandasError(msg)` - with msg being a friendly error message to the user + """ + + try: + encoding = encoding or input_encoding + return _decode_obj(obj, encoding) + except UnicodeDecodeError: + from textwrap import dedent + msg = \ + """ + The input Data contains strings that cannot be decoded with `%s`. + You should specify a correct encoding to the object constructor, + or set the value of the default input encoding in XXX. + """ + + s = dedent(msg) % encoding + if guess_enc_on_decode_failure: + if not _can_import("chardet"): + s += 'The "chardet" package is not installed - ' +\ + "can't suggest an encoding." 
+ else: + det_enc=_detect_encoding(obj) + if det_enc: + conf = det_enc['confidence'] + enc = det_enc['encoding'] + s += 'You might try "%s" as the encoding (Confidence: %2.1f)'\ + % (enc, conf) + + raise PandasError(s) diff --git a/pandas/core/format.py b/pandas/core/format.py index 7125feeeb3b1c..4f6f3a16c8441 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -237,23 +237,12 @@ def _to_str_columns(self, force_unicode=False): if not py3compat.PY3: if force_unicode: - def make_unicode(x): - if isinstance(x, unicode): - return x - return x.decode('utf-8') - strcols = map(lambda col: map(make_unicode, col), strcols) + strcols = map(lambda col: map(unicode, col), strcols) else: - # generally everything is plain strings, which has ascii - # encoding. problem is when there is a char with value over 127 - # - everything then gets converted to unicode. try: map(lambda col: map(str, col), strcols) except UnicodeError: - def make_unicode(x): - if isinstance(x, unicode): - return x - return x.decode('utf-8') - strcols = map(lambda col: map(make_unicode, col), strcols) + strcols = map(lambda col: map(unicode, col), strcols) return strcols @@ -1121,6 +1110,8 @@ def reset(self): def _put_lines(buf, lines): + # handles #891 where ascii and unicode fields are mixed + # but will fail if encoded bytesting +unicode fields are mixed if any(isinstance(x, unicode) for x in lines): lines = [unicode(x) for x in lines] buf.write('\n'.join(lines)) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e20aba116ef04..6d4f0ca73be11 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -37,6 +37,7 @@ from pandas.util.decorators import deprecate, Appender, Substitution from pandas.tseries.period import PeriodIndex +import pandas.core.encoding as en import pandas.core.algorithms as algos import pandas.core.datetools as datetools @@ -366,6 +367,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if data is None: data = {} + columns= 
en.decode_catch_errors(columns) + index= en.decode_catch_errors(index) + data= en.decode_catch_errors(data) + if isinstance(data, DataFrame): data = data._data @@ -950,7 +955,7 @@ def from_items(cls, items, columns=None, orient='columns'): @classmethod def from_csv(cls, path, header=0, sep=',', index_col=0, - parse_dates=True, encoding=None): + parse_dates=True, encoding='utf-8'): """ Read delimited file into DataFrame @@ -1675,6 +1680,8 @@ def iget_value(self, i, j): def __getitem__(self, key): # slice rows + key=en.decode_catch_errors(key) + if isinstance(key, slice): from pandas.core.indexing import _is_index_slice idx_type = self.index.inferred_type @@ -1793,6 +1800,9 @@ def __setattr__(self, name, value): def __setitem__(self, key, value): # support boolean setting with DataFrame input, e.g. # df[df > df2] = 0 + value=en.decode_catch_errors(value) + key=en.decode_catch_errors(key) + if isinstance(key, DataFrame): if not (key.index.equals(self.index) and key.columns.equals(self.columns)): @@ -1972,6 +1982,9 @@ def xs(self, key, axis=0, level=None, copy=True): ------- xs : Series or DataFrame """ + + key=en.decode_catch_errors(key) + labels = self._get_axis(axis) if level is not None: loc, new_ax = labels.get_loc_level(key, level=level) diff --git a/pandas/core/index.py b/pandas/core/index.py index 08d1c593d42ca..01854f5475183 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -5,8 +5,8 @@ from itertools import izip import numpy as np - -from pandas.core.common import ndtake +import pandas.core.encoding as en +from pandas.core.common import ndtake,_is_sequence from pandas.util.decorators import cache_readonly import pandas.core.common as com import pandas.lib as lib @@ -80,6 +80,8 @@ class Index(np.ndarray): _engine_type = lib.ObjectEngine def __new__(cls, data, dtype=None, copy=False, name=None): + data= en.decode_catch_errors(data) + if isinstance(data, np.ndarray): if issubclass(data.dtype.type, np.datetime64): from pandas.tseries.index import 
DatetimeIndex @@ -305,12 +307,15 @@ def __contains__(self, key): def __hash__(self): return hash(self.view(np.ndarray)) - def __setitem__(self, key, value): - """Disable the setting of values.""" - raise Exception(str(self.__class__) + ' object is immutable') + def __getattribute__(self,name): + if name=="__setitem__": # emulate an Immutable ndarray + raise AttributeError(str(self.__class__) + ' object is immutable') + else: + return object.__getattribute__(self,name) def __getitem__(self, key): """Override numpy.ndarray's __getitem__ method to work as desired""" + key=en.decode_catch_errors(key) arr_idx = self.view(np.ndarray) if np.isscalar(key): return arr_idx[key] @@ -1163,6 +1168,8 @@ class Int64Index(Index): _engine_type = lib.Int64Engine def __new__(cls, data, dtype=None, copy=False, name=None): + data= en.decode_catch_errors(data) + if not isinstance(data, np.ndarray): if np.isscalar(data): raise ValueError('Index(...) must be called with a collection ' @@ -1252,6 +1259,10 @@ class MultiIndex(Index): names = None def __new__(cls, levels=None, labels=None, sortorder=None, names=None): + levels= en.decode_catch_errors(levels) + labels= en.decode_catch_errors(labels) + names= en.decode_catch_errors(names) + assert(len(levels) == len(labels)) if len(levels) == 0: raise Exception('Must pass non-zero number of levels/labels') @@ -1634,6 +1645,7 @@ def __setstate__(self, state): self.sortorder = sortorder def __getitem__(self, key): + key=en.decode_catch_errors(key) if np.isscalar(key): return tuple(lev[lab[key]] for lev, lab in zip(self.levels, self.labels)) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 211434ab07154..dd6d5351fff38 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -21,7 +21,7 @@ import pandas.core.common as com import pandas.core.nanops as nanops import pandas.lib as lib - +import pandas.core.encoding as en def _ensure_like_indices(time, panels): """ @@ -76,6 +76,10 @@ def panel_index(time, panels, names=['time', 
'panel']): (1961, 'B'), (1961, 'C'), (1962, 'A'), (1962, 'B'), (1962, 'C')], dtype=object) """ + + panels= en.decode_catch_errors(panels) + names= en.decode_catch_errors(names) + time, panels = _ensure_like_indices(time, panels) time_factor = Factor.from_array(time) panel_factor = Factor.from_array(panels) @@ -208,6 +212,9 @@ def __init__(self, data=None, items=None, major_axis=None, minor_axis=None, if data is None: data = {} + data = en.decode_catch_errors(data) + items = en.decode_catch_errors(items) + passed_axes = [items, major_axis, minor_axis] axes = None if isinstance(data, BlockManager): @@ -566,6 +573,9 @@ def _box_item_values(self, key, values): def __getattr__(self, name): """After regular attribute access, try looking up the name of an item. This allows simpler access to items for interactive use.""" + + name = en.decode_catch_errors(name) + if name in self.items: return self[name] raise AttributeError("'%s' object has no attribute '%s'" % @@ -577,6 +587,11 @@ def _slice(self, slobj, axis=0): def __setitem__(self, key, value): _, N, K = self.shape + + + key = en.decode_catch_errors(key) + value = en.decode_catch_errors(value) + if isinstance(value, DataFrame): value = value.reindex(index=self.major_axis, columns=self.minor_axis) @@ -1463,4 +1478,3 @@ def complete_dataframe(obj, prev_completions): install_ipython_completers() except Exception: pass - diff --git a/pandas/core/series.py b/pandas/core/series.py index eca177c4c543b..c16ebe7e74d2e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -34,6 +34,8 @@ from pandas.compat.scipy import scoreatpercentile as _quantile +import pandas.core.encoding as en + __all__ = ['Series', 'TimeSeries'] _np_version = np.version.short_version @@ -300,6 +302,10 @@ def __new__(cls, data=None, index=None, dtype=None, name=None, if data is None: data = {} + index= en.decode_catch_errors(index) + data= en.decode_catch_errors(data) + name= en.decode_catch_errors(name) + if index is not None: index = 
_ensure_index(index) @@ -460,6 +466,7 @@ def ix(self): return self._ix def __getitem__(self, key): + key=en.decode_catch_errors(key) try: return self.index.get_value(self, key) except InvalidIndexError: @@ -557,6 +564,9 @@ def _get_values(self, indexer): return self.values[indexer] def __setitem__(self, key, value): + value=en.decode_catch_errors(value) + key=en.decode_catch_errors(key) + try: try: self.index._engine.set_value(self, key, value) @@ -1153,7 +1163,7 @@ def max(self, axis=None, out=None, skipna=True, level=None): @Substitution(name='standard deviation', shortname='stdev', na_action=_doc_exclude_na, extras='') - @Appender(_stat_doc + + @Appender(_stat_doc + """ Normalized by N-1 (unbiased estimator). """) @@ -1166,7 +1176,7 @@ def std(self, axis=None, dtype=None, out=None, ddof=1, skipna=True, @Substitution(name='variance', shortname='var', na_action=_doc_exclude_na, extras='') - @Appender(_stat_doc + + @Appender(_stat_doc + """ Normalized by N-1 (unbiased estimator). """) diff --git a/pandas/tests/test_encoding.py b/pandas/tests/test_encoding.py new file mode 100644 index 0000000000000..89e4612f59c27 --- /dev/null +++ b/pandas/tests/test_encoding.py @@ -0,0 +1,148 @@ +import unittest +import nose +import pandas.core.encoding as en +from pandas.util import py3compat + +try: + next +except NameError: # pragma: no cover + # Python < 2.6 + def next(x): + return x.next() + +class TestEncoding(unittest.TestCase): + def setUp(self): + self.u = u"\u03c3" + self.u_e = self.u.encode('utf-8') + self.seq =self.assertEqual + + def tearDown(self): + pass + + def test_decode(self): + if py3compat.PY3: + raise nose.SkipTest() + + self.seq(en.decode_catch_errors([]),[]) + self.seq(en.decode_catch_errors([1,2.5,True]),[1,2.5,True]) + self.seq(en.decode_catch_errors([u"1","2",3]), [u'1','2', 3]) + self.seq(en.decode_catch_errors([self.u,2,3]),[self.u, 2, 3]) + self.seq(en.decode_catch_errors([self.u_e,"a"]),[self.u, 'a']) + + def test_decode_with_nested(self): + if 
py3compat.PY3: + raise nose.SkipTest() + + self.seq(en.decode_catch_errors([self.u_e,["a"],u"a"]),[self.u, ['a'], u'a']) # ascii left alone + + + def test_decode_with_immutable_seq(self): + if py3compat.PY3: + raise nose.SkipTest() + + # mutables are not altered + self.assertTrue(en.decode_catch_errors((self.u_e,))==(self.u_e,)) + self.assertTrue(en.decode_catch_errors(["abc",(u"abc",)])==["abc", (u"abc",)]) # mutables not converted + + def test_decode_with_nested_and_dicts(self): + if py3compat.PY3: + raise nose.SkipTest() + + self.seq(en.decode_catch_errors({"a":"b"}), {u'a': u'b'}) + + r=[u'a',u'b', 1, 2.5, True, {u'a': u'b'}, + [u'a', u'b', 1, 2.5, True, {u'a': u'b'}, + [u'a', u'b', 1, 2.5, True, {u'a': u'b'}], + [u'a', u'b', 1, 2.5, True, {u'a': u'b'}]]] + + self.seq(en.decode_catch_errors(["a",u"b",1,2.5,True,{"a":"b"}, + ["a",u"b",1,2.5,True,{"a":"b"}, + ["a",u"b",1,2.5,True,{"a":"b"}], + ["a",u"b",1,2.5,True,{"a":"b"}]]]),r) + + r= [{"k": [self.u, [self.u,1], u'b']}] + self.seq(en.decode_catch_errors([{"k":[self.u_e,[self.u_e,1],u"b"]}]),r) + + def test_decode_non_seq(self): + self.seq(en.decode_catch_errors("abcd"),"abcd") + self.seq(en.decode_catch_errors(u"abcd"),u"abcd") + + def test_extract_text(self): + if py3compat.PY3: + raise nose.SkipTest() + + # test with self.seq, pure str, pure unicode + g=en._extract_txt_from_obj(u"abcd") + + try: + next(g) + except StopIteration: + pass + else: + self.fail("erroneous yield") + # self.assertRaises(StopIteration,next(g)) + + g=en._extract_txt_from_obj("abcd") + self.seq(next(g),"abcd") + + g=en._extract_txt_from_obj("\xcc") + self.seq(next(g),"\xcc") + + g=en._extract_txt_from_obj(["abcd","\xcc"]) + self.seq(next(g),"abcd") + self.seq(next(g),"\xcc") + + def test_recursion_limit_safe(self): + "Test against recursion limit" + import sys + + a=["a"] + for i in range(sys.getrecursionlimit()+1): + a=["a",a] + + try: + en.decode_catch_errors(a) + except RuntimeError: + self.fail("en.decode_self.seq() Implementation 
cannot handle deeply-nested self.sequences") + + def test_ordered_dict_key_ordering(self): + "Test That OrderedDicts keep their key ordering" + import string,random,sys + + if sys.version_info[:2]<(2,7): + raise nose.SkipTest + + from collections import OrderedDict + self.seq=self.assertEqual + + for i in range(100): + keys=[string.ascii_letters[random.randint(1,20)] for x in range(20)] + d=OrderedDict.fromkeys(keys) + # after decoding, is the order of keys is maintained? + self.seq( en.decode_catch_errors([d])[0].keys(),map(unicode,d.keys())) + + def test_detect_text_enc(self): + import string + if en._can_import("chardet"): + res=en._detect_encoding(string.ascii_letters,min_cnt=10) + self.assertTrue(isinstance(res,dict)) + self.assertTrue('confidence' in res and 'encoding' in res) # keys in result dict + res=en._detect_encoding("a") # not enough confidence, return empty + self.assertTrue(res=={}) + + def test_detector_detects_enc(self): + s='\xf9\xec\xe5\xed \xf8\xe1 \xf9\xe5\xe1\xea'+\ + '\xf6\xe9\xf4\xe5\xf8\xe4 \xf0\xe7\xee\xe3\xfa' + + if en._can_import("chardet"): + res=en._detect_encoding(s,min_cnt=0) + self.assertTrue(isinstance(res,dict)) + self.assertTrue('confidence' in res and 'encoding' in res) # keys in result dict + self.assertEqual(res['encoding'],"windows-1255") # keys in result dict + + + def test_text_extract_limit_iter(self): + if en._can_import("chardet"): + seq=["a","a","b"] + for x in en._extract_txt_from_obj(seq,2): + self.assertNotEqual(x,"b") diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 4ea62d695042a..c8182feb436f6 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -143,11 +143,6 @@ def test_to_string_unicode_two(self): buf = StringIO() dm.to_string(buf) - def test_to_string_unicode_three(self): - dm = DataFrame(['\xc2']) - buf = StringIO() - dm.to_string(buf) - def test_to_string_with_formatters(self): df = DataFrame({'int': [1, 2, 3], 'float': [1.0, 2.0, 3.0], @@ -709,7 +704,7 
@@ def test_to_html_with_classes(self):
Index([], dtype=object) | +Index((), dtype=object) | Empty DataFrame |