Skip to content

More Unicode, factor out pprinting of labels and names #2005

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from Oct 12, 2012
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
03401c2
BUG: index.format should accept unicode index names
Oct 1, 2012
7fdeccb
CLN: Move test_console_encode out of wrong test class
Oct 10, 2012
2c086e4
TST: unless a file is pure ascii, you must specify an encoding
Oct 1, 2012
5d03a6b
TST: remove fmt.test_to_string_force_unicode
Oct 6, 2012
d3c062b
CLN: Move _is_sequence() from pd.frame to pd.common, with other is_*
Oct 8, 2012
4c337f5
TST: add test for _is_sequence()
Oct 4, 2012
cbeff93
ENH: rework console encoding detection in fmt.print_config
Oct 4, 2012
d859d15
ENH: Add helpers to pd.common: pprint_thing/_encoded(),console_encode()
Oct 10, 2012
fb3e8e2
TST: add test_pprint_thing()
Oct 6, 2012
11dff0d
TST: Series repr fails when name is tuple holding non string-type #2051
lodagro Oct 11, 2012
17d9c12
ENH: SeriesFormatter footer repr now uses pprint_thing()
Oct 11, 2012
15a78cf
ENH: explicitly encode retval of SeriesFormatter.to_string() with con…
Oct 1, 2012
55b4631
ENH: Index summary() and format() now delegate to pprint_thing()
Oct 1, 2012
f24f772
ENH: tseries.Index.summary() now delegates to pprint_thing()
Oct 2, 2012
00f2a97
BUG: TextReader._explicit_index_names() should allow for unicode inde…
Oct 2, 2012
0e11730
BUG: parsers._concat_date_cols should accept unicode
Oct 2, 2012
6a197ce
TST: test dataframe to_csv() with unicode index and columns
Oct 2, 2012
a9896a6
TST: test series to_csv() with unicode index
Oct 2, 2012
c9c0f95
BUG: csvwriter writerow() now delegates to pprint_thing() for non-tex…
Oct 2, 2012
c907f6f
TST: add test for UnicodeWriter with csv.QUOTE_NONNUMERIC
Oct 8, 2012
2e1001d
ENH: add is_number() helper to pd.core.common
Oct 8, 2012
d115c86
ENH: UnicodeWriter (CSV) now supports quoting=csv.QUOTE_NONNUMERIC
Oct 8, 2012
6677d39
CLN: Expunge stringify_seq() in favor of pprint_thing() in Index.form…
Oct 2, 2012
7567b65
BUG: Add checks to df,series repr() to handle python3
Oct 4, 2012
d968508
TST: repr() should return type str() on py2 and py3
Oct 7, 2012
95678eb
ENH: Index.__repr__ now uses pprint_thing/_encoded().
Oct 8, 2012
5fa2ae4
CLN: Abolish stringify and _strify in favor of pprint_thing()
Oct 4, 2012
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 111 additions & 29 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,23 +721,6 @@ def _index_labels_to_array(labels):

return labels

def _stringify(col, encoding='UTF8'):
# unicode workaround
try:
return unicode(col)
except UnicodeError:
try:
if isinstance(col, str):
return col.decode(encoding)
except UnicodeError:
pass
return console_encode(col)

def _stringify_seq(values):
if any(isinstance(x, unicode) for x in values):
return [_stringify(x) for x in values]
return [str(x) for x in values]

def _maybe_make_list(obj):
if obj is not None and not isinstance(obj, (tuple, list)):
return [obj]
Expand All @@ -753,6 +736,9 @@ def is_iterator(obj):
# python 3 generators have __next__ instead of next
return hasattr(obj, 'next') or hasattr(obj, '__next__')

def is_number(obj):
    """
    Return True if `obj` is an instance of np.number, int, long or float.
    """
    numeric_types = (np.number, int, long, float)
    return isinstance(obj, numeric_types)

def is_integer_dtype(arr_or_dtype):
if isinstance(arr_or_dtype, np.dtype):
tipo = arr_or_dtype.type
Expand All @@ -778,6 +764,14 @@ def is_float_dtype(arr_or_dtype):
def is_list_like(arg):
    """
    Return True if `arg` exposes ``__iter__`` and is not a string
    (basestring) instance.
    """
    if isinstance(arg, basestring):
        return False
    return hasattr(arg, '__iter__')

def _is_sequence(x):
try:
iter(x)
assert(not isinstance(x, basestring))
return True
except Exception:
return False

_ensure_float64 = _algos.ensure_float64
_ensure_int64 = _algos.ensure_int64
_ensure_int32 = _algos.ensure_int32
Expand Down Expand Up @@ -859,16 +853,6 @@ def load(path):
finally:
f.close()

def console_encode(value):
    """
    Encode a unicode `value` for the console.

    On Python 3, or when `value` is not a unicode object, `value` is
    returned untouched. Otherwise it is encoded with the encoding of
    sys.stdin (falling back to 'utf-8' when that is empty), replacing
    characters that cannot be represented. If that attempt raises
    AttributeError or TypeError, 'ascii' with replacement is used.
    """
    import sys

    needs_encoding = not py3compat.PY3 and isinstance(value, unicode)
    if not needs_encoding:
        return value

    try:
        target = sys.stdin.encoding or 'utf-8'
        return value.encode(target, 'replace')
    except (AttributeError, TypeError):
        return value.encode('ascii', 'replace')

class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
Expand Down Expand Up @@ -932,10 +916,17 @@ def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
self.quoting=kwds.get("quoting",None)

def writerow(self, row):
row = [x if isinstance(x, basestring) else str(x) for x in row]
self.writer.writerow([s.encode("utf-8") for s in row])
def _check_as_is(x):
return (self.quoting == csv.QUOTE_NONNUMERIC and \
is_number(x)) or isinstance(x, str)

row = [x if _check_as_is(x)
else pprint_thing(x).encode('utf-8') for x in row]

self.writer.writerow([s for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
Expand All @@ -960,3 +951,94 @@ def _concat_compat(to_concat, axis=0):
return new_values.view(_NS_DTYPE)
else:
return np.concatenate(to_concat, axis=axis)

# Unicode consolidation
# ---------------------
#
# pprinting utility functions for generating Unicode text or bytes(3.x)/str(2.x)
# representations of objects.
# Try to use these as much as possible rather than rolling your own.
#
# When to use
# -----------
#
# 1) If you're writing code internal to pandas (no I/O directly involved),
# use pprint_thing().
#
# It will always return unicode text which can be handled by other
# parts of the package without breakage.
#
# 2) If you need to send something to the console, use console_encode().
#
# console_encode() should (hopefully) choose the right encoding for you
# based on the encoding set in fmt.print_config.encoding.
#
# 3) if you need to write something out to file, use pprint_thing_encoded(encoding).
#
# If no encoding is specified, it defaults to utf-8. Since encoding pure ascii with
# utf-8 is a no-op you can safely use the default utf-8 if you're working with
# straight ascii.

def _pprint_seq(seq, _nest_lvl=0):
    """
    internal. pprinter for iterables. you should probably use pprint_thing()
    rather than calling this directly.

    Objects that have ``__setitem__`` are rendered inside square brackets,
    everything else inside parentheses. Each element is formatted by
    pprint_thing() one nesting level deeper.
    """
    is_mutable = hasattr(seq, '__setitem__')
    body = ", ".join(pprint_thing(item, _nest_lvl + 1) for item in seq)
    if is_mutable:
        return u"[%s]" % body
    return u"(%s)" % body

def pprint_thing(thing, _nest_lvl=0):
    """
    The sanctioned way of converting an object to a unicode
    representation.

    Nested sequences containing unicode strings are handled element by
    element (a plain unicode(object) call does not do that).

    Parameters
    ----------
    thing : anything to be formatted
    _nest_lvl : internal use only. pprint_thing() is mutually recursive
        with _pprint_seq(); this argument tracks the current nesting
        level so that it can be capped at print_config.pprint_nest_depth.

    Returns
    -------
    result - unicode object on py2, str on py3. Always Unicode.

    """
    from pandas.core.format import print_config

    if thing is None:
        return unicode('')

    if _is_sequence(thing) and _nest_lvl < print_config.pprint_nest_depth:
        return unicode(_pprint_seq(thing, _nest_lvl))

    # Inside the package everything passed in should be a unicode object
    # or have a unicode __str__. As a transition aid, utf-8 encoded byte
    # strings are accepted too; any other encoding cannot be guessed, so
    # undecodable bytes are replaced.
    try:
        text = unicode(thing)  # the preferred path
    except UnicodeDecodeError:
        text = str(thing).decode('utf-8', "replace")

    return unicode(text)  # always unicode

def pprint_thing_encoded(object, encoding='utf-8', errors='replace'):
    """
    Format `object` with pprint_thing() and encode the resulting text
    using `encoding` and the `errors` handling scheme.
    """
    return pprint_thing(object).encode(encoding, errors)

def console_encode(object):
    """
    This is the sanctioned way to prepare something for sending
    *to the console*. It delegates to pprint_thing_encoded() to get an
    encoded representation of the object, using the global encoding set
    in print_config.encoding. Use this everywhere you output to the
    console.
    """
    # The docstring has to be the first statement of the function,
    # otherwise it is a discarded string expression and __doc__ stays None.
    from pandas.core.format import print_config
    return pprint_thing_encoded(object, print_config.encoding)
61 changes: 36 additions & 25 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
except:
from io import StringIO

from pandas.core.common import adjoin, isnull, notnull, _stringify
from pandas.core.common import adjoin, isnull, notnull
from pandas.core.index import MultiIndex, _ensure_index
from pandas.util import py3compat

Expand Down Expand Up @@ -72,7 +72,7 @@ def __init__(self, series, buf=None, header=True, length=True,
self.float_format = float_format

def _get_footer(self):
footer = ''
footer = u''

if self.name:
if getattr(self.series.index, 'freq', None):
Expand All @@ -81,24 +81,15 @@ def _get_footer(self):
if footer and self.series.name:
footer += ', '

if self.series.name:
if isinstance(self.series.name, basestring):
series_name = self.series.name
elif isinstance(self.series.name, tuple):
series_name = "('%s')" % "', '".join(self.series.name)
else:
series_name = str(self.series.name)
else:
series_name = self.series.name

footer += (("Name: %s" % series_name)
if series_name is not None else '')
series_name = com.pprint_thing(self.series.name)
footer += ("Name: %s" % series_name) if self.series.name is not None else ""

if self.length:
if footer:
footer += ', '
footer += 'Length: %d' % len(self.series)
return footer

return unicode(footer)

def _get_formatted_index(self):
index = self.series.index
Expand Down Expand Up @@ -143,7 +134,9 @@ def to_string(self):
if footer:
result.append(footer)

return '\n'.join(result)
if py3compat.PY3:
return unicode(u'\n'.join(result))
return com.console_encode(u'\n'.join(result))

if py3compat.PY3: # pragma: no cover
_encode_diff = lambda x: 0
Expand Down Expand Up @@ -724,12 +717,7 @@ def _format_strings(self, use_unicode=False):
else:
float_format = self.float_format

if use_unicode:
def _strify(x):
return _stringify(x, print_config.encoding)
formatter = _strify if self.formatter is None else self.formatter
else:
formatter = str if self.formatter is None else self.formatter
formatter = com.pprint_thing if self.formatter is None else self.formatter

def _format(x):
if self.na_rep is not None and lib.checknull(x):
Expand Down Expand Up @@ -1098,10 +1086,33 @@ def __init__(self):
self.notebook_repr_html = True
self.date_dayfirst = False
self.date_yearfirst = False
self.pprint_nest_depth = 3
self.multi_sparse = True
self.encoding = sys.getdefaultencoding()
if self.encoding == 'ascii':
self.encoding = 'UTF8'
self.encoding = self.detect_encoding()

def detect_encoding(self):
    """
    Try to find the most capable encoding supported by the console.
    Slightly modified from the way IPython handles the same issue.

    Order of preference: the encoding of sys.stdin; if that is missing,
    empty or 'ascii', locale.getpreferredencoding(); if there is still
    nothing, sys.getdefaultencoding().
    """
    import locale

    # a missing stdin or a missing attribute both count as "unknown"
    encoding = getattr(getattr(sys, 'stdin', None), 'encoding', None)

    if not encoding or encoding == 'ascii':  # try again for something better
        try:
            encoding = locale.getpreferredencoding()
        except Exception:
            pass

    # when all else fails. this will usually be "ascii"
    return encoding or sys.getdefaultencoding()

def reset(self):
self.__init__()
Expand Down
17 changes: 6 additions & 11 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
import numpy as np
import numpy.ma as ma

from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
_default_index, _stringify)
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,\
_default_index,_is_sequence)
from pandas.core.generic import NDFrame
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels
Expand Down Expand Up @@ -584,6 +584,9 @@ def __repr__(self):
else:
self.to_string(buf=buf)
value = buf.getvalue()

if py3compat.PY3:
return unicode(value)
return com.console_encode(value)

def _repr_html_(self):
Expand Down Expand Up @@ -1365,7 +1368,7 @@ def info(self, verbose=True, buf=None):
# hack
if verbose and len(self.columns) < 100:
lines.append('Data columns:')
space = max([len(_stringify(k)) for k in self.columns]) + 4
space = max([len(com.pprint_thing(k)) for k in self.columns]) + 4
counts = self.count()
assert(len(cols) == len(counts))
for col, count in counts.iteritems():
Expand Down Expand Up @@ -5106,14 +5109,6 @@ def _homogenize(data, index, columns, dtype=None):
def _put_str(s, space):
return ('%s' % s)[:space].ljust(space)

def _is_sequence(x):
try:
iter(x)
assert(not isinstance(x, basestring))
return True
except Exception:
return False

def install_ipython_completers(): # pragma: no cover
"""Register the DataFrame type with IPython's tab completion machinery, so
that it knows about accessing column names as attributes."""
Expand Down
Loading