diff --git a/pandas/core/common.py b/pandas/core/common.py index 3957bca6eff84..85524c7542fbf 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -705,11 +705,16 @@ def _index_labels_to_array(labels): return labels -def _stringify(col): +def _stringify(col, encoding='UTF8'): # unicode workaround try: return unicode(col) except UnicodeError: + try: + if isinstance(col, str): + return col.decode(encoding) + except UnicodeError: + pass return console_encode(col) def _stringify_seq(values): @@ -930,4 +935,3 @@ def _concat_compat(to_concat): return new_values.view(_NS_DTYPE) else: return np.concatenate(to_concat) - diff --git a/pandas/core/format.py b/pandas/core/format.py index c5be148db31e0..65cbd48be8919 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -1,4 +1,5 @@ from itertools import izip +import sys try: from StringIO import StringIO @@ -480,15 +481,30 @@ def __init__(self, values, digits=7, formatter=None, na_rep='NaN', self.justify = justify def get_result(self): - if self._have_unicode(): + if self._conv_unicode(): fmt_values = self._format_strings(use_unicode=True) else: fmt_values = self._format_strings(use_unicode=False) return _make_fixed_width(fmt_values, self.justify) - def _have_unicode(self): - mask = lib.map_infer(self.values, lambda x: isinstance(x, unicode)) + def _conv_unicode(self): + #check if any needs and can be converted to nonascii encoding + def _nonascii(x): + if isinstance(x, unicode): + return True + try: + if isinstance(x, str): + x.decode('ascii') + return False + except UnicodeError: + try: + x.decode(print_config.encoding) + return True + except UnicodeError: + return False + return False + mask = lib.map_infer(self.values, _nonascii) return mask.any() def _format_strings(self, use_unicode=False): @@ -501,7 +517,9 @@ def _format_strings(self, use_unicode=False): float_format = self.float_format if use_unicode: - formatter = _stringify if self.formatter is None else self.formatter + def _strify(x): + return _stringify(x, print_config.encoding) + formatter = _strify if self.formatter is None else self.formatter else: formatter = str if self.formatter is None else self.formatter @@ -668,7 +686,7 @@ def set_printoptions(precision=None, column_space=None, max_rows=None, max_columns=None, colheader_justify=None, max_colwidth=None, notebook_repr_html=None, date_dayfirst=None, date_yearfirst=None, - multi_sparse=None): + multi_sparse=None, encoding=None): """ Alter default behavior of DataFrame.toString @@ -716,6 +734,8 @@ def set_printoptions(precision=None, column_space=None, max_rows=None, print_config.date_yearfirst = date_yearfirst if multi_sparse is not None: print_config.multi_sparse = multi_sparse + if encoding is not None: + print_config.encoding = encoding def reset_printoptions(): print_config.reset() @@ -846,6 +866,9 @@ def __init__(self): self.date_dayfirst = False self.date_yearfirst = False self.multi_sparse = True + self.encoding = sys.getdefaultencoding() + if self.encoding == 'ascii': + self.encoding = 'UTF8' def reset(self): self.__init__() diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 0f02eba0229c7..1484a41b91bf3 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -150,6 +150,12 @@ def test_to_html_unicode(self): df = DataFrame({'A' : [u'\u03c3']}) df.to_html() + def test_nonunicode_nonascii_alignment(self): + df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) + rep_str = df.to_string() + lines = rep_str.split('\n') + self.assert_(len(lines[1]) == len(lines[2])) + def test_unicode_problem_decoding_as_ascii(self): dm = DataFrame({u'c/\u03c3': Series({'test':np.NaN})}) unicode(dm.to_string()) @@ -776,4 +782,3 @@ def test_misc(self): import nose nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], exit=False) -