Skip to content

Commit 00b31f1

Browse files
committed
BUG: try to convert non-unicode non-ascii characters in repr #1620
1 parent 8cc9826 commit 00b31f1

File tree

3 files changed

+40
-8
lines changed

3 files changed

+40
-8
lines changed

pandas/core/common.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -705,11 +705,16 @@ def _index_labels_to_array(labels):
705705

706706
return labels
707707

708-
def _stringify(col):
708+
def _stringify(col, encoding='UTF8'):
709709
# unicode workaround
710710
try:
711711
return unicode(col)
712712
except UnicodeError:
713+
try:
714+
if isinstance(col, str):
715+
return col.decode(encoding)
716+
except UnicodeError:
717+
pass
713718
return console_encode(col)
714719

715720
def _stringify_seq(values):
@@ -930,4 +935,3 @@ def _concat_compat(to_concat):
930935
return new_values.view(_NS_DTYPE)
931936
else:
932937
return np.concatenate(to_concat)
933-

pandas/core/format.py

+28-5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from itertools import izip
2+
import sys
23

34
try:
45
from StringIO import StringIO
@@ -480,15 +481,30 @@ def __init__(self, values, digits=7, formatter=None, na_rep='NaN',
480481
self.justify = justify
481482

482483
def get_result(self):
483-
if self._have_unicode():
484+
if self._conv_unicode():
484485
fmt_values = self._format_strings(use_unicode=True)
485486
else:
486487
fmt_values = self._format_strings(use_unicode=False)
487488

488489
return _make_fixed_width(fmt_values, self.justify)
489490

490-
def _have_unicode(self):
491-
mask = lib.map_infer(self.values, lambda x: isinstance(x, unicode))
491+
def _conv_unicode(self):
492+
# check whether any value is unicode, or is a non-ASCII byte string that can be decoded with the configured print encoding
493+
def _nonascii(x):
494+
if isinstance(x, unicode):
495+
return True
496+
try:
497+
if isinstance(x, str):
498+
x.decode('ascii')
499+
return False
500+
except UnicodeError:
501+
try:
502+
x.decode(print_config.encoding)
503+
return True
504+
except UnicodeError:
505+
return False
506+
return False
507+
mask = lib.map_infer(self.values, _nonascii)
492508
return mask.any()
493509

494510
def _format_strings(self, use_unicode=False):
@@ -501,7 +517,9 @@ def _format_strings(self, use_unicode=False):
501517
float_format = self.float_format
502518

503519
if use_unicode:
504-
formatter = _stringify if self.formatter is None else self.formatter
520+
def _strify(x):
521+
return _stringify(x, print_config.encoding)
522+
formatter = _strify if self.formatter is None else self.formatter
505523
else:
506524
formatter = str if self.formatter is None else self.formatter
507525

@@ -668,7 +686,7 @@ def set_printoptions(precision=None, column_space=None, max_rows=None,
668686
max_columns=None, colheader_justify=None,
669687
max_colwidth=None, notebook_repr_html=None,
670688
date_dayfirst=None, date_yearfirst=None,
671-
multi_sparse=None):
689+
multi_sparse=None, encoding=None):
672690
"""
673691
Alter default behavior of DataFrame.to_string
674692
@@ -716,6 +734,8 @@ def set_printoptions(precision=None, column_space=None, max_rows=None,
716734
print_config.date_yearfirst = date_yearfirst
717735
if multi_sparse is not None:
718736
print_config.multi_sparse = multi_sparse
737+
if encoding is not None:
738+
print_config.encoding = encoding
719739

720740
def reset_printoptions():
721741
print_config.reset()
@@ -846,6 +866,9 @@ def __init__(self):
846866
self.date_dayfirst = False
847867
self.date_yearfirst = False
848868
self.multi_sparse = True
869+
self.encoding = sys.getdefaultencoding()
870+
if self.encoding == 'ascii':
871+
self.encoding = 'UTF8'
849872

850873
def reset(self):
851874
self.__init__()

pandas/tests/test_format.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,12 @@ def test_to_html_unicode(self):
150150
df = DataFrame({'A' : [u'\u03c3']})
151151
df.to_html()
152152

153+
def test_nonunicode_nonascii_alignment(self):
154+
df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]])
155+
rep_str = df.to_string()
156+
lines = rep_str.split('\n')
157+
self.assert_(len(lines[1]) == len(lines[2]))
158+
153159
def test_unicode_problem_decoding_as_ascii(self):
154160
dm = DataFrame({u'c/\u03c3': Series({'test':np.NaN})})
155161
unicode(dm.to_string())
@@ -776,4 +782,3 @@ def test_misc(self):
776782
import nose
777783
nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
778784
exit=False)
779-

0 commit comments

Comments
 (0)