Skip to content

Configurability of unicode/console encoding #1654

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,11 +705,16 @@ def _index_labels_to_array(labels):

return labels

def _stringify(col):
def _stringify(col, encoding='UTF8'):
# unicode workaround
try:
return unicode(col)
except UnicodeError:
try:
if isinstance(col, str):
return col.decode(encoding)
except UnicodeError:
pass
return console_encode(col)

def _stringify_seq(values):
Expand Down Expand Up @@ -930,4 +935,3 @@ def _concat_compat(to_concat):
return new_values.view(_NS_DTYPE)
else:
return np.concatenate(to_concat)

33 changes: 28 additions & 5 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from itertools import izip
import sys

try:
from StringIO import StringIO
Expand Down Expand Up @@ -480,15 +481,30 @@ def __init__(self, values, digits=7, formatter=None, na_rep='NaN',
self.justify = justify

def get_result(self):
if self._have_unicode():
if self._conv_unicode():
fmt_values = self._format_strings(use_unicode=True)
else:
fmt_values = self._format_strings(use_unicode=False)

return _make_fixed_width(fmt_values, self.justify)

def _have_unicode(self):
mask = lib.map_infer(self.values, lambda x: isinstance(x, unicode))
def _conv_unicode(self):
# check whether any value is unicode, or is a non-ASCII str that can be decoded with the configured encoding
def _nonascii(x):
if isinstance(x, unicode):
return True
try:
if isinstance(x, str):
x.decode('ascii')
return False
except UnicodeError:
try:
x.decode(print_config.encoding)
return True
except UnicodeError:
return False
return False
mask = lib.map_infer(self.values, _nonascii)
return mask.any()

def _format_strings(self, use_unicode=False):
Expand All @@ -501,7 +517,9 @@ def _format_strings(self, use_unicode=False):
float_format = self.float_format

if use_unicode:
formatter = _stringify if self.formatter is None else self.formatter
def _strify(x):
return _stringify(x, print_config.encoding)
formatter = _strify if self.formatter is None else self.formatter
else:
formatter = str if self.formatter is None else self.formatter

Expand Down Expand Up @@ -668,7 +686,7 @@ def set_printoptions(precision=None, column_space=None, max_rows=None,
max_columns=None, colheader_justify=None,
max_colwidth=None, notebook_repr_html=None,
date_dayfirst=None, date_yearfirst=None,
multi_sparse=None):
multi_sparse=None, encoding=None):
"""
Alter default behavior of DataFrame.to_string

Expand Down Expand Up @@ -716,6 +734,8 @@ def set_printoptions(precision=None, column_space=None, max_rows=None,
print_config.date_yearfirst = date_yearfirst
if multi_sparse is not None:
print_config.multi_sparse = multi_sparse
if encoding is not None:
print_config.encoding = encoding

def reset_printoptions():
print_config.reset()
Expand Down Expand Up @@ -846,6 +866,9 @@ def __init__(self):
self.date_dayfirst = False
self.date_yearfirst = False
self.multi_sparse = True
self.encoding = sys.getdefaultencoding()
if self.encoding == 'ascii':
self.encoding = 'UTF8'

def reset(self):
self.__init__()
Expand Down
7 changes: 6 additions & 1 deletion pandas/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,12 @@ def test_to_html_unicode(self):
df = DataFrame({'A' : [u'\u03c3']})
df.to_html()

def test_nonunicode_nonascii_alignment(self):
df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]])
rep_str = df.to_string()
lines = rep_str.split('\n')
self.assert_(len(lines[1]) == len(lines[2]))

def test_unicode_problem_decoding_as_ascii(self):
dm = DataFrame({u'c/\u03c3': Series({'test':np.NaN})})
unicode(dm.to_string())
Expand Down Expand Up @@ -776,4 +782,3 @@ def test_misc(self):
import nose
nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
exit=False)