From 03401c2849fb21029b641b91746057f898255248 Mon Sep 17 00:00:00 2001 From: y-p Date: Mon, 1 Oct 2012 23:53:29 +0200 Subject: [PATCH 01/27] BUG: index.format should accept unicode index names --- pandas/core/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 6443bbf01a4f2..69ad8c3eab358 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -382,7 +382,7 @@ def format(self, name=False): header = [] if name: - header.append(str(self.name) if self.name is not None else '') + header.append(unicode(self.name) if self.name is not None else '') if self.is_all_dates: zero_time = time(0, 0) From 7fdeccbf1bf5435dbbc73cad74f04e9a11100b1f Mon Sep 17 00:00:00 2001 From: y-p Date: Wed, 10 Oct 2012 21:52:49 +0200 Subject: [PATCH 02/27] CLN: Move test_console_encode out of wrong test class --- pandas/tests/test_common.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index e2b0b918f0142..a57eeba3818da 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -172,6 +172,20 @@ def test_ensure_int32(): result = com._ensure_int32(values) assert(result.dtype == np.int32) +def test_console_encode(): + """ + On Python 2, if sys.stdin.encoding is None (IPython with zmq frontend) + common.console_encode should encode things as utf-8. + """ + if py3compat.PY3: + raise nose.SkipTest + + with tm.stdin_encoding(encoding=None): + result = com.console_encode(u"\u05d0") + expected = u"\u05d0".encode('utf-8') + assert (result == expected) + + class TestTake(unittest.TestCase): def test_1d_with_out(self): @@ -309,20 +323,6 @@ def test_2d_float32(self): expected[:, [2, 4]] = np.nan tm.assert_almost_equal(result, expected) - def test_console_encode(self): - """ - On Python 2, if sys.stdin.encoding is None (IPython with zmq frontend) - common.console_encode should encode things as utf-8. 
- """ - if py3compat.PY3: - raise nose.SkipTest - - with tm.stdin_encoding(encoding=None): - result = com.console_encode(u"\u05d0") - expected = u"\u05d0".encode('utf-8') - self.assertEqual(result, expected) - - if __name__ == '__main__': nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], exit=False) From 2c086e43e51a870c595e68df09f4d10902068d51 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 2 Oct 2012 01:27:27 +0200 Subject: [PATCH 03/27] TST: unless a file is pure ascii, you must specify an encoding --- pandas/tests/test_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 9d45273e78b9b..2668a41b790da 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -235,7 +235,7 @@ def test_unicode_problem_decoding_as_ascii(self): def test_string_repr_encoding(self): pth = curpath() filepath = os.path.join(pth, 'data', 'unicode_series.csv') - df = pandas.read_csv(filepath, header=None) + df = pandas.read_csv(filepath, header=None,encoding='latin1') repr(df) repr(df['X1']) From 5d03a6bea0f03306531a61436ca4b3d3afb9479b Mon Sep 17 00:00:00 2001 From: y-p Date: Sat, 6 Oct 2012 05:26:12 +0200 Subject: [PATCH 04/27] TST: remove fmt.test_to_string_force_unicode I think this is the wrong behaviour, and it breaks some future unicode fixes. the constructor should complain that no encoding was specified when the input is not ascii. 
--- pandas/tests/test_format.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 2668a41b790da..468f3679079fc 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -133,12 +133,6 @@ def test_to_string_unicode_three(self): buf = StringIO() dm.to_string(buf) - def test_to_string_force_unicode(self): - #given string with non-ascii characters - df = DataFrame([["aaää", 1], ["bbbb", 2]]) - result = df.to_string(force_unicode=True) - self.assertEqual(result, u' 0 1\n0 aa\xe4\xe4 1\n1 bbbb 2') - def test_to_string_with_formatters(self): df = DataFrame({'int': [1, 2, 3], 'float': [1.0, 2.0, 3.0], From d3c062b94fb289f35dd344c85bbf8b6358eb8fff Mon Sep 17 00:00:00 2001 From: y-p Date: Mon, 8 Oct 2012 22:15:00 +0200 Subject: [PATCH 05/27] CLN: Move _is_sequence() from pd.frame to pd.common, with other is_* --- pandas/core/common.py | 8 ++++++++ pandas/core/frame.py | 10 +--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 668017c29c6ab..4c2db78c4322c 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -778,6 +778,14 @@ def is_float_dtype(arr_or_dtype): def is_list_like(arg): return hasattr(arg, '__iter__') and not isinstance(arg, basestring) +def _is_sequence(x): + try: + iter(x) + assert(not isinstance(x, basestring)) + return True + except Exception: + return False + _ensure_float64 = _algos.ensure_float64 _ensure_int64 = _algos.ensure_int64 _ensure_int32 = _algos.ensure_int32 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 91005ead01a24..8b46aeb463241 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -25,7 +25,7 @@ import numpy.ma as ma from pandas.core.common import (isnull, notnull, PandasError, _try_sort, - _default_index, _stringify) + _default_index,_stringify,_is_sequence) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, 
_ensure_index from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels @@ -5106,14 +5106,6 @@ def _homogenize(data, index, columns, dtype=None): def _put_str(s, space): return ('%s' % s)[:space].ljust(space) -def _is_sequence(x): - try: - iter(x) - assert(not isinstance(x, basestring)) - return True - except Exception: - return False - def install_ipython_completers(): # pragma: no cover """Register the DataFrame type with IPython's tab completion machinery, so that it knows about accessing column names as attributes.""" From 4c337f56a6d87476d20de9277df65ac0be8da62b Mon Sep 17 00:00:00 2001 From: y-p Date: Thu, 4 Oct 2012 06:04:55 +0200 Subject: [PATCH 06/27] TST: add test for _is_sequence() --- pandas/tests/test_common.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index a57eeba3818da..48519bf8335be 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -14,6 +14,14 @@ from pandas.util import py3compat +def test_is_sequence(): + is_seq=com._is_sequence + assert(is_seq((1,2))) + assert(is_seq([1,2])) + assert(not is_seq("abcd")) + assert(not is_seq(u"abcd")) + assert(not is_seq(np.int64)) + def test_notnull(): assert notnull(1.) 
assert not notnull(None) From cbeff93c9ac027d64d711adaaba454116722a064 Mon Sep 17 00:00:00 2001 From: y-p Date: Thu, 4 Oct 2012 23:59:52 +0200 Subject: [PATCH 07/27] ENH: rework console encoding detection in fmt.print_config --- pandas/core/format.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index dca1976be838f..53d12e5e05a88 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -1099,9 +1099,31 @@ def __init__(self): self.date_dayfirst = False self.date_yearfirst = False self.multi_sparse = True - self.encoding = sys.getdefaultencoding() - if self.encoding == 'ascii': - self.encoding = 'UTF8' + self.encoding = self.detect_encoding() + + def detect_encoding(self): + """ + Try to find the most capable encoding supported by the console. + slighly modified from the way IPython handles the same issue. + """ + import locale + + encoding = None + try: + encoding=sys.stdin.encoding + except AttributeError: + pass + + if not encoding or encoding =='ascii': # try again for something better + try: + encoding = locale.getpreferredencoding() + except Exception: + pass + + if not encoding: # when all else fails. 
this will usually be "ascii" + encoding = sys.getdefaultencoding() + + return encoding def reset(self): self.__init__() From d859d15d348a85057b20c57918ec0a9b85b0ff3a Mon Sep 17 00:00:00 2001 From: y-p Date: Wed, 10 Oct 2012 19:22:13 +0200 Subject: [PATCH 08/27] ENH: Add helpers to pd.common: pprint_thing/_encoded(),console_encode() --- pandas/core/common.py | 101 +++++++++++++++++++++++++++++++++++++----- pandas/core/format.py | 1 + 2 files changed, 92 insertions(+), 10 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 4c2db78c4322c..7ab05257860ea 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -867,16 +867,6 @@ def load(path): finally: f.close() -def console_encode(value): - if py3compat.PY3 or not isinstance(value, unicode): - return value - - try: - import sys - return value.encode(sys.stdin.encoding or 'utf-8', 'replace') - except (AttributeError, TypeError): - return value.encode('ascii', 'replace') - class UTF8Recoder: """ Iterator that reads an encoded stream and reencodes the input to UTF-8 @@ -968,3 +958,94 @@ def _concat_compat(to_concat, axis=0): return new_values.view(_NS_DTYPE) else: return np.concatenate(to_concat, axis=axis) + +# Unicode consolidation +# --------------------- +# +# pprinting utility functions for generating Unicode text or bytes(3.x)/str(2.x) +# representations of objects. +# Try to use these as much as possible rather then rolling your own. +# +# When to use +# ----------- +# +# 1) If you're writing code internal to pandas (no I/O directly involved), +# use pprint_thing(). +# +# It will always return unicode text which can handled by other +# parts of the package without breakage. +# +# 2) If you need to send something to the console, use console_encode(). +# +# console_encode() should (hopefully) choose the right encoding for you +# based on the encoding set in fmt.print_config.encoding. +# +# 3) if you need to write something out to file, use pprint_thing_encoded(encoding). 
+# +# If no encoding is specified, it defaults to utf-8. SInce encoding pure ascii with +# utf-8 is a no-op you can safely use the default utf-8 if you're working with +# straight ascii. + +def _pprint_seq(seq,_nest_lvl=0): + """ + internal. pprinter for iterables. you should probably use pprint_thing() + rather then calling this directly. + """ + fmt=u"[%s]" if hasattr(seq,'__setitem__') else u"(%s)" + return fmt % ", ".join(pprint_thing(e,_nest_lvl+1) for e in seq) + +def pprint_thing(thing,_nest_lvl=0): + """ + This function is the sanctioned way of converting objects + to a unicode representation. + + properly handles nested sequences containing unicode strings + (unicode(object) does not) + + Parameters + ---------- + thing : anything to be formatted + _nest_lvl : internal use only. pprint_thing() is mutually-recursive + with pprint_sequence, this argument is used to keep track of the + current nesting level, and limit it. + + Returns + ------- + result - unicode object on py2, str on py3. Always Unicode. + + """ + from pandas.core.format import print_config + if thing is None: + result = '' + elif _is_sequence(thing) and _nest_lvl < print_config.pprint_nest_depth: + result = _pprint_seq(thing,_nest_lvl) + else: + # when used internally in the package, everything + # passed in should be a unicode object or have a unicode + # __str__. However as an aid to transition, we also accept + # utf8 encoded strings, if that's not it, we have no way + # to know, and the user should deal with it himself. 
+ # so we resort to utf-8 with replacing errors + + try: + result = unicode(thing) # we should try this first + except UnicodeDecodeError: + # either utf-8 or we replace errors + result = str(thing).decode('utf-8',"replace") + + return unicode(result) # always unicode + +def pprint_thing_encoded(object,encoding='utf-8',errors='replace'): + value=pprint_thing(object) # get unicode representation of object + return value.encode(encoding, errors) + +def console_encode(object): + from pandas.core.format import print_config + """ + this is the sanctioned way to prepare something for + sending *to the console*, it delegates to pprint_thing() to get + a unicode representation of the object relies on the global encoding + set in print_config.encoding. Use this everywhere + where you output to the console. + """ + return pprint_thing_encoded(object,print_config.encoding) diff --git a/pandas/core/format.py b/pandas/core/format.py index 53d12e5e05a88..ebf720cb7b8f2 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -1098,6 +1098,7 @@ def __init__(self): self.notebook_repr_html = True self.date_dayfirst = False self.date_yearfirst = False + self.pprint_nest_depth = 3 self.multi_sparse = True self.encoding = self.detect_encoding() From fb3e8e21e15ab3ceca190fbe27e9cd6023359cdc Mon Sep 17 00:00:00 2001 From: y-p Date: Sat, 6 Oct 2012 05:29:58 +0200 Subject: [PATCH 09/27] TST: add test_pprint_thing() --- pandas/tests/test_common.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 48519bf8335be..586eb589568fa 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -193,6 +193,21 @@ def test_console_encode(): expected = u"\u05d0".encode('utf-8') assert (result == expected) +def test_pprint_thing(): + if py3compat.PY3: + raise nose.SkipTest + + pp_t=com.pprint_thing + + assert(pp_t('a')==u'a') + assert(pp_t(u'a')==u'a') + assert(pp_t(None)=='') + 
assert(pp_t(u'\u05d0')==u'\u05d0') + assert(pp_t((u'\u05d0',u'\u05d1'))==u'(\u05d0, \u05d1)') + assert(pp_t((u'\u05d0',(u'\u05d1',u'\u05d2')))== + u'(\u05d0, (\u05d1, \u05d2))') + assert(pp_t(('foo',u'\u05d0',(u'\u05d0',u'\u05d0')))== + u'(foo, \u05d0, (\u05d0, \u05d0))') class TestTake(unittest.TestCase): From 11dff0d925278680a193b87cf267d21869618d2e Mon Sep 17 00:00:00 2001 From: Wouter Overmeire Date: Thu, 11 Oct 2012 21:01:34 +0200 Subject: [PATCH 10/27] TST: Series repr fails when name is tuple holding non string-type #2051 --- pandas/tests/test_series.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 3a28401fb4f15..19a9d0fc09a63 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1011,9 +1011,16 @@ def test_repr(self): ots[::2] = None repr(ots) - # tuple name, e.g. from hierarchical index - self.series.name = ('foo', 'bar', 'baz') - repr(self.series) + # various names + for name in ['', 1, 1.2, 'foo', u'\u03B1\u03B2\u03B3', + 'loooooooooooooooooooooooooooooooooooooooooooooooooooong', + ('foo', 'bar', 'baz'), + (1, 2), + ('foo', 1, 2.3), + (u'\u03B1', u'\u03B2', u'\u03B3'), + (u'\u03B1', 'bar')]: + self.series.name = name + repr(self.series) biggie = Series(tm.randn(1000), index=np.arange(1000), name=('foo', 'bar', 'baz')) From 17d9c12862e96f8c1432f73ad4b93a8d32e443b7 Mon Sep 17 00:00:00 2001 From: y-p Date: Thu, 11 Oct 2012 22:44:35 +0200 Subject: [PATCH 11/27] ENH: SeriesFormatter footer repr now uses pprint_thing() fixes #2051 --- pandas/core/format.py | 19 +++++-------------- pandas/core/series.py | 2 +- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index ebf720cb7b8f2..3b1a0d818a06a 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -72,7 +72,7 @@ def __init__(self, series, buf=None, header=True, length=True, self.float_format = float_format def 
_get_footer(self): - footer = '' + footer = u'' if self.name: if getattr(self.series.index, 'freq', None): @@ -81,24 +81,15 @@ def _get_footer(self): if footer and self.series.name: footer += ', ' - if self.series.name: - if isinstance(self.series.name, basestring): - series_name = self.series.name - elif isinstance(self.series.name, tuple): - series_name = "('%s')" % "', '".join(self.series.name) - else: - series_name = str(self.series.name) - else: - series_name = self.series.name - - footer += (("Name: %s" % series_name) - if series_name is not None else '') + series_name = com.pprint_thing(self.series.name) + footer += ("Name: %s" % series_name) if self.series.name is not None else "" if self.length: if footer: footer += ', ' footer += 'Length: %d' % len(self.series) - return footer + + return unicode(footer) def _get_formatted_index(self): index = self.series.index diff --git a/pandas/core/series.py b/pandas/core/series.py index 7400aa5bde2e7..8dbda4cbb6fe9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -861,7 +861,7 @@ def _tidy_repr(self, max_vals=20): return '%s\n%s' % (result, self._repr_footer()) def _repr_footer(self): - namestr = "Name: %s, " % str(self.name) if self.name is not None else "" + namestr = "Name: %s, " % com.pprint_thing(self.name) if self.name is not None else "" return '%sLength: %d' % (namestr, len(self)) def to_string(self, buf=None, na_rep='NaN', float_format=None, From 15a78cfa1da0f2443a02b4db452b34f7e024e7cf Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 2 Oct 2012 01:38:02 +0200 Subject: [PATCH 12/27] ENH: explicitly encode retval of SeriesFormatter.to_string() with console_encode() --- pandas/core/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index 3b1a0d818a06a..aac5bbe3b9c07 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -134,7 +134,7 @@ def to_string(self): if footer: result.append(footer) - return '\n'.join(result) 
+ return com.console_encode(u'\n'.join(result)) if py3compat.PY3: # pragma: no cover _encode_diff = lambda x: 0 From 55b4631c787f84ddc27a4b533bea13efd8db3729 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 2 Oct 2012 01:39:21 +0200 Subject: [PATCH 13/27] ENH: Index summary() and format() now delegate to pprint_thing() --- pandas/core/index.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 69ad8c3eab358..44155ddcf4831 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -196,7 +196,8 @@ def _has_complex_internals(self): def summary(self, name=None): if len(self) > 0: - index_summary = ', %s to %s' % (unicode(self[0]), unicode(self[-1])) + index_summary = ', %s to %s' % (com.pprint_thing(self[0]), + com.pprint_thing(self[-1])) else: index_summary = '' @@ -382,7 +383,7 @@ def format(self, name=False): header = [] if name: - header.append(unicode(self.name) if self.name is not None else '') + header.append(com.pprint_thing(self.name) if self.name is not None else '') if self.is_all_dates: zero_time = time(0, 0) From f24f7724a113370174f90b948c17a9ffa977cfe1 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 2 Oct 2012 02:52:26 +0200 Subject: [PATCH 14/27] ENH: tseries.Index.summary() now delegates to pprint_thing() --- pandas/tseries/index.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index be7193418e82f..6d5fd6f560ffe 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -581,13 +581,15 @@ def groupby(self, f): def summary(self, name=None): if len(self) > 0: - index_summary = ', %s to %s' % (str(self[0]), str(self[-1])) + index_summary = ', %s to %s' % (com.pprint_thing(self[0]), + com.pprint_thing(self[-1])) else: index_summary = '' if name is None: name = type(self).__name__ - result = '%s: %s entries%s' % (name, len(self), index_summary) + result = '%s: %s entries%s' % (com.pprint_thing(name), + 
len(self), index_summary) if self.freq: result += '\nFreq: %s' % self.freqstr From 00f2a979e47a2d77de3d4cdfce72d29f17fed7d0 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 2 Oct 2012 02:49:03 +0200 Subject: [PATCH 15/27] BUG: TextReader._explicit_index_names() should allow for unicode index_name --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d1ede95527029..bf47fdc5e4695 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -749,7 +749,7 @@ def _explicit_index_names(self, columns): else: index_name = columns[self.index_col] - if index_name is not None and 'Unnamed' in str(index_name): + if index_name is not None and 'Unnamed' in unicode(index_name): index_name = None elif self.index_col is not None: From 0e1173023aee44ae455e0a5b1a3f444b507b78d5 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 2 Oct 2012 13:49:12 +0200 Subject: [PATCH 16/27] BUG: parsers._concat_date_cols should accept unicode --- pandas/io/parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bf47fdc5e4695..db8c4a132d25b 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1186,10 +1186,10 @@ def _get_col_names(colspec, columns): def _concat_date_cols(date_cols): if len(date_cols) == 1: - return np.array([str(x) for x in date_cols[0]], dtype=object) + return np.array([unicode(x) for x in date_cols[0]], dtype=object) # stripped = [map(str.strip, x) for x in date_cols] - rs = np.array([' '.join([str(y) for y in x]) + rs = np.array([' '.join([unicode(y) for y in x]) for x in zip(*date_cols)], dtype=object) return rs From 6a197cef9596614506fb729f00a7892c9e74b3f9 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 2 Oct 2012 14:26:34 +0200 Subject: [PATCH 17/27] TST: test dataframe to_csv() with unicode index and columns --- pandas/tests/test_frame.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git 
a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index cf37de4294f3e..1cc62e3943433 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3481,6 +3481,18 @@ def test_to_csv_unicode(self): os.remove(path) + def test_to_csv_unicode_index_col(self): + buf=StringIO('') + df=DataFrame([[u"\u05d0","d2","d3","d4"],["a1","a2","a3","a4"]], + columns=[u"\u05d0",u"\u05d1",u"\u05d2",u"\u05d3"], + index=[u"\u05d0",u"\u05d1"]) + + df.to_csv(buf, encoding='UTF-8') + buf.seek(0) + + df2 = pan.read_csv(buf, index_col=0, encoding='UTF-8') + assert_frame_equal(df, df2) + def test_to_csv_stringio(self): buf = StringIO() self.frame.to_csv(buf) From a9896a62215ea968b503e311510cc4fd2fd2ac1b Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 2 Oct 2012 14:26:51 +0200 Subject: [PATCH 18/27] TST: test series to_csv() with unicode index --- pandas/tests/test_series.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 19a9d0fc09a63..3e7934c01bb46 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -2033,12 +2033,16 @@ def test_to_csv(self): os.remove('_foo') - def test_to_csv_stringio(self): - buf = StringIO() - self.ts.to_csv(buf, index=False) + def test_to_csv_unicode_index(self): + buf=StringIO() + s=Series([u"\u05d0","d2"], index=[u"\u05d0",u"\u05d1"]) + + s.to_csv(buf, encoding='UTF-8') buf.seek(0) - arr = np.loadtxt(buf) - assert_almost_equal(arr, self.ts.values) + + s2 = Series.from_csv(buf, index_col=0, encoding='UTF-8') + + assert_series_equal(s, s2) def test_to_dict(self): self.assert_(np.array_equal(Series(self.ts.to_dict()), self.ts)) From c9c0f95ae6e4056c62eee597fb9451ab80cc9c65 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 2 Oct 2012 03:37:01 +0200 Subject: [PATCH 19/27] BUG: csvwriter writerow() now delegates to pprint_thing() for non-text objects --- pandas/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/pandas/core/common.py b/pandas/core/common.py index 7ab05257860ea..c4e90a0c5b4ca 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -932,7 +932,7 @@ def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): self.encoder = codecs.getincrementalencoder(encoding)() def writerow(self, row): - row = [x if isinstance(x, basestring) else str(x) for x in row] + row = [x if isinstance(x, basestring) else pprint_thing(x) for x in row] self.writer.writerow([s.encode("utf-8") for s in row]) # Fetch UTF-8 output from the queue ... data = self.queue.getvalue() From c907f6f44d34e94309ac26def5b0ee87f64a133a Mon Sep 17 00:00:00 2001 From: y-p Date: Mon, 8 Oct 2012 22:05:57 +0200 Subject: [PATCH 20/27] TST: add test for UnicodeWriter with csv.QUOTE_NONNUMERIC --- pandas/tests/test_frame.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 1cc62e3943433..45ec1fa427410 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3530,6 +3530,23 @@ def test_to_csv_quoting(self): self.assertEqual(result, expected) + def test_to_csv_unicodewriter_quoting(self): + import csv + + df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}) + + buf = StringIO() + df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, + encoding = 'utf-8') + + result = buf.getvalue() + expected = ('"A","B"\n' + '1,"foo"\n' + '2,"bar"\n' + '3,"baz"\n') + + self.assertEqual(result, expected) + def test_to_csv_index_no_leading_comma(self): df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=['one', 'two', 'three']) From 2e1001d166cf89337195d7444ba4d60cd7c61b0a Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 9 Oct 2012 01:02:45 +0200 Subject: [PATCH 21/27] ENH: add is_number() helper to pd.core.common --- pandas/core/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/common.py b/pandas/core/common.py index c4e90a0c5b4ca..086ceab0c1313 100644 --- 
a/pandas/core/common.py +++ b/pandas/core/common.py @@ -753,6 +753,9 @@ def is_iterator(obj): # python 3 generators have __next__ instead of next return hasattr(obj, 'next') or hasattr(obj, '__next__') +def is_number(obj): + return isinstance(obj, (np.number, int, long, float)) + def is_integer_dtype(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): tipo = arr_or_dtype.type From d115c864ca04ee875787eae193f8fcc6a8bb158c Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 9 Oct 2012 00:59:07 +0200 Subject: [PATCH 22/27] ENH: UnicodeWriter (CSV) now supports quoting=csv.QUOTE_NONNUMERIC --- pandas/core/common.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 086ceab0c1313..265f7fbc5367b 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -933,10 +933,17 @@ def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): self.writer = csv.writer(self.queue, dialect=dialect, **kwds) self.stream = f self.encoder = codecs.getincrementalencoder(encoding)() + self.quoting=kwds.get("quoting",None) def writerow(self, row): - row = [x if isinstance(x, basestring) else pprint_thing(x) for x in row] - self.writer.writerow([s.encode("utf-8") for s in row]) + def _check_as_is(x): + return (self.quoting == csv.QUOTE_NONNUMERIC and \ + is_number(x)) or isinstance(x, str) + + row = [x if _check_as_is(x) + else pprint_thing(x).encode('utf-8') for x in row] + + self.writer.writerow([s for s in row]) # Fetch UTF-8 output from the queue ... 
data = self.queue.getvalue() data = data.decode("utf-8") From 6677d391900d57e5780a68cbc20bb16408f1ba38 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 2 Oct 2012 18:23:16 +0200 Subject: [PATCH 23/27] CLN: Expunge stringify_seq() in favor of pprint_thing() in Index.format() --- pandas/core/common.py | 5 ----- pandas/core/index.py | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 265f7fbc5367b..a9ee8bda05adc 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -733,11 +733,6 @@ def _stringify(col, encoding='UTF8'): pass return console_encode(col) -def _stringify_seq(values): - if any(isinstance(x, unicode) for x in values): - return [_stringify(x) for x in values] - return [str(x) for x in values] - def _maybe_make_list(obj): if obj is not None and not isinstance(obj, (tuple, list)): return [obj] diff --git a/pandas/core/index.py b/pandas/core/index.py index 44155ddcf4831..67bb93efd0994 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -400,7 +400,7 @@ def format(self, name=False): values = lib.maybe_convert_objects(values, safe=1) if values.dtype == np.object_: - result = com._stringify_seq(values) + result = [com.pprint_thing(x) for x in values] else: result = _trim_front(format_array(values, None, justify='left')) return header + result From 7567b655a3d3219142e2cabc940cdf2190b91af9 Mon Sep 17 00:00:00 2001 From: y-p Date: Thu, 4 Oct 2012 02:47:34 +0200 Subject: [PATCH 24/27] BUG: Add checks to df,series repr() to handle python3 --- pandas/core/format.py | 2 ++ pandas/core/frame.py | 3 +++ pandas/core/series.py | 2 ++ 3 files changed, 7 insertions(+) diff --git a/pandas/core/format.py b/pandas/core/format.py index aac5bbe3b9c07..13dd189e9744e 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -134,6 +134,8 @@ def to_string(self): if footer: result.append(footer) + if py3compat.PY3: + return unicode(u'\n'.join(result)) return 
com.console_encode(u'\n'.join(result)) if py3compat.PY3: # pragma: no cover diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8b46aeb463241..a131c9d402552 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -584,6 +584,9 @@ def __repr__(self): else: self.to_string(buf=buf) value = buf.getvalue() + + if py3compat.PY3: + return unicode(value) return com.console_encode(value) def _repr_html_(self): diff --git a/pandas/core/series.py b/pandas/core/series.py index 8dbda4cbb6fe9..eca177c4c543b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -848,6 +848,8 @@ def __repr__(self): else: result = '%s' % ndarray.__repr__(self) + if py3compat.PY3: + return unicode(result) return com.console_encode(result) def _tidy_repr(self, max_vals=20): From d96850877947836c4617b93e3936143273a64637 Mon Sep 17 00:00:00 2001 From: y-p Date: Sun, 7 Oct 2012 23:23:19 +0200 Subject: [PATCH 25/27] TST: repr() should return type str() on py2 and py3 --- pandas/tests/test_format.py | 15 +++++++++++++++ pandas/tests/test_series.py | 14 ++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 468f3679079fc..4ea62d695042a 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -82,6 +82,21 @@ def test_repr_truncation(self): fmt.print_config.max_colwidth = max_len + 2 self.assert_('...' not in repr(df)) + def test_repr_should_return_str (self): + """ + http://docs.python.org/py3k/reference/datamodel.html#object.__repr__ + http://docs.python.org/reference/datamodel.html#object.__repr__ + "...The return value must be a string object." 
+ + (str on py2.x, str (unicode) on py3) + + """ + data=[8,5,3,5] + index1=[u"\u03c3",u"\u03c4",u"\u03c5",u"\u03c6"] + cols=[u"\u03c8"] + df=DataFrame(data,columns=cols,index=index1) + self.assertTrue(type(df.__repr__() == str)) # both py2 / 3 + def test_to_string_repr_unicode(self): buf = StringIO() diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 3e7934c01bb46..1f1b3285fb22d 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1057,6 +1057,20 @@ def test_repr_name_iterable_indexable(self): s.name = (u"\u05d0",) * 2 repr(s) + def test_repr_should_return_str (self): + """ + http://docs.python.org/py3k/reference/datamodel.html#object.__repr__ + http://docs.python.org/reference/datamodel.html#object.__repr__ + "...The return value must be a string object." + + (str on py2.x, str (unicode) on py3) + + """ + data=[8,5,3,5] + index1=[u"\u03c3",u"\u03c4",u"\u03c5",u"\u03c6"] + df=Series(data,index=index1) + self.assertTrue(type(df.__repr__() == str)) # both py2 / 3 + def test_timeseries_repr_object_dtype(self): index = Index([datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], dtype=object) From 95678ebfce0e46969da00b7de61f3b096b304fd7 Mon Sep 17 00:00:00 2001 From: y-p Date: Mon, 8 Oct 2012 20:48:58 +0200 Subject: [PATCH 26/27] ENH: Index.__repr__ now uses pprint_thing/_encoded(). result should not change, unless unicode is present. 
--- pandas/core/index.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 67bb93efd0994..94f8737188674 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -12,6 +12,7 @@ import pandas.lib as lib import pandas._algos as _algos from pandas.lib import Timestamp +from pandas.util import py3compat __all__ = ['Index'] @@ -132,12 +133,11 @@ def _shallow_copy(self): return self.view() def __repr__(self): - try: - result = np.ndarray.__repr__(self) - except UnicodeEncodeError: - result = 'Index([%s])' % (', '.join([repr(x) for x in self])) - - return result + if py3compat.PY3: + prepr = com.pprint_thing(self) + else: + prepr = com.pprint_thing_encoded(self) + return 'Index(%s, dtype=%s)' % (prepr,self.dtype) def astype(self, dtype): return Index(self.values.astype(dtype), name=self.name, From 5fa2ae439f4e02e0d97ff05fbba16337685297f8 Mon Sep 17 00:00:00 2001 From: y-p Date: Thu, 4 Oct 2012 02:36:35 +0200 Subject: [PATCH 27/27] CLN: Abolish stringify and _strify in favor of pprint_thing() --- pandas/core/common.py | 12 --------- pandas/core/format.py | 9 ++----- pandas/core/frame.py | 6 ++--- pandas/core/index.py | 8 +----- pandas/tools/plotting.py | 54 ++++++++++++++++---------------------- pandas/tseries/plotting.py | 2 +- 6 files changed, 30 insertions(+), 61 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index a9ee8bda05adc..8e851c67176f1 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -721,18 +721,6 @@ def _index_labels_to_array(labels): return labels -def _stringify(col, encoding='UTF8'): - # unicode workaround - try: - return unicode(col) - except UnicodeError: - try: - if isinstance(col, str): - return col.decode(encoding) - except UnicodeError: - pass - return console_encode(col) - def _maybe_make_list(obj): if obj is not None and not isinstance(obj, (tuple, list)): return [obj] diff --git a/pandas/core/format.py 
b/pandas/core/format.py index 13dd189e9744e..7125feeeb3b1c 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -8,7 +8,7 @@ except: from io import StringIO -from pandas.core.common import adjoin, isnull, notnull, _stringify +from pandas.core.common import adjoin, isnull, notnull from pandas.core.index import MultiIndex, _ensure_index from pandas.util import py3compat @@ -717,12 +717,7 @@ def _format_strings(self, use_unicode=False): else: float_format = self.float_format - if use_unicode: - def _strify(x): - return _stringify(x, print_config.encoding) - formatter = _strify if self.formatter is None else self.formatter - else: - formatter = str if self.formatter is None else self.formatter + formatter = com.pprint_thing if self.formatter is None else self.formatter def _format(x): if self.na_rep is not None and lib.checknull(x): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a131c9d402552..e20aba116ef04 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -24,8 +24,8 @@ import numpy as np import numpy.ma as ma -from pandas.core.common import (isnull, notnull, PandasError, _try_sort, - _default_index,_stringify,_is_sequence) +from pandas.core.common import (isnull, notnull, PandasError, _try_sort,\ + _default_index,_is_sequence) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels @@ -1368,7 +1368,7 @@ def info(self, verbose=True, buf=None): # hack if verbose and len(self.columns) < 100: lines.append('Data columns:') - space = max([len(_stringify(k)) for k in self.columns]) + 4 + space = max([len(com.pprint_thing(k)) for k in self.columns]) + 4 counts = self.count() assert(len(cols) == len(counts)) for col, count in counts.iteritems(): diff --git a/pandas/core/index.py b/pandas/core/index.py index 94f8737188674..08d1c593d42ca 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1474,15 +1474,9 @@ 
def get_level_values(self, level): def format(self, space=2, sparsify=None, adjoin=True, names=False, na_rep='NaN'): - from pandas.core.common import _stringify - from pandas.core.format import print_config - def _strify(x): - return _stringify(x, print_config.encoding) - if len(self) == 0: return [] - stringified_levels = [] for lev, lab in zip(self.levels, self.labels): if len(lev) > 0: @@ -1497,7 +1491,7 @@ def _strify(x): level = [] if names: - level.append(_strify(name) if name is not None else '') + level.append(com.pprint_thing(name) if name is not None else '') level.extend(np.array(lev, dtype=object)) result_levels.append(level) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 60ed0c70d516b..34754a23ba5b4 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -207,7 +207,7 @@ def normalize(series): line = ax.scatter(to_plot[class_][0], to_plot[class_][1], color=random_color(class_), - label=com._stringify(class_), **kwds) + label=com.pprint_thing(class_), **kwds) ax.legend() ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor='none')) @@ -272,8 +272,8 @@ def random_color(column): f = function(row) y = [f(t) for t in x] label = None - if com._stringify(class_col[i]) not in used_legends: - label = com._stringify(class_col[i]) + if com.pprint_thing(class_col[i]) not in used_legends: + label = com.pprint_thing(class_col[i]) used_legends.add(label) ax.plot(x, y, color=random_color(class_col[i]), label=label) ax.legend(loc='upper right') @@ -378,8 +378,8 @@ def random_color(column): y = row label = None kls = class_col.iget_value(i) - if com._stringify(kls) not in used_legends: - label = com._stringify(kls) + if com.pprint_thing(kls) not in used_legends: + label = com.pprint_thing(kls) used_legends.add(label) ax.plot(x, y, color=random_color(kls), label=label, **kwds) @@ -671,7 +671,7 @@ def _adorn_subplots(self): self.axes[0].set_title(self.title) if self._need_to_set_index: - labels = [_stringify(key) for key in 
self.data.index] + labels = [com.pprint_thing(key) for key in self.data.index] labels = dict(zip(range(len(self.data.index)), labels)) for ax_ in self.axes: @@ -685,10 +685,10 @@ def legend_title(self): if not isinstance(self.data.columns, MultiIndex): name = self.data.columns.name if name is not None: - name = com._stringify(name) + name = com.pprint_thing(name) return name else: - stringified = map(com._stringify, + stringified = map(com.pprint_thing, self.data.columns.names) return ','.join(stringified) else: @@ -742,13 +742,13 @@ def _get_index_name(self): if isinstance(self.data.index, MultiIndex): name = self.data.index.names if any(x is not None for x in name): - name = ','.join([com._stringify(x) for x in name]) + name = ','.join([com.pprint_thing(x) for x in name]) else: name = None else: name = self.data.index.name if name is not None: - name = com._stringify(name) + name = com.pprint_thing(name) return name @@ -806,7 +806,7 @@ def _make_plot(self): ax = self._get_ax(i) style = self._get_style(i, label) - label = com._stringify(label) + label = com.pprint_thing(label) gkde = gaussian_kde(y) sample_range = max(y) - min(y) @@ -902,7 +902,7 @@ def _maybe_add_color(kwargs, style, i): _maybe_add_color(kwds, style, i) - label = _stringify(label) + label = com.pprint_thing(label).encode('utf-8') mask = com.isnull(y) if mask.any(): @@ -947,7 +947,7 @@ def to_leg_label(label, i): if isinstance(data, Series): ax = self._get_ax(0) #self.axes[0] style = self.style or '' - label = com._stringify(self.label) + label = com.pprint_thing(self.label) kwds = kwargs.copy() _maybe_add_color(kwds, style, 0) @@ -959,7 +959,7 @@ def to_leg_label(label, i): labels.append(leg_label) else: for i, col in enumerate(data.columns): - label = com._stringify(col) + label = com.pprint_thing(col) ax = self._get_ax(i) style = self._get_style(i, col) kwds = kwargs.copy() @@ -1097,7 +1097,7 @@ def _make_plot(self): K = self.nseries for i, (label, y) in enumerate(self._iter_data()): - label = 
com._stringify(label) + label = com.pprint_thing(label) kwds = self.kwds.copy() kwds['color'] = colors[i % len(colors)] @@ -1125,7 +1125,7 @@ def _make_plot(self): def _post_plot_logic(self): for ax in self.axes: - str_index = [_stringify(key) for key in self.data.index] + str_index = [com.pprint_thing(key) for key in self.data.index] name = self._get_index_name() if self.kind == 'bar': @@ -1359,7 +1359,7 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, def plot_group(grouped, ax): keys, values = zip(*grouped) - keys = [_stringify(x) for x in keys] + keys = [com.pprint_thing(x) for x in keys] values = [remove_na(v) for v in values] ax.boxplot(values, **kwds) if kwds.get('vert', 1): @@ -1394,7 +1394,7 @@ def plot_group(grouped, ax): cols = columns else: cols = data.columns - keys = [_stringify(x) for x in cols] + keys = [com.pprint_thing(x) for x in cols] # Return boxplot dict in single plot case @@ -1411,14 +1411,6 @@ def plot_group(grouped, ax): fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) return ret - -def _stringify(x): - if isinstance(x, tuple): - return '|'.join(com._stringify(y) for y in x) - else: - return com._stringify(x) - - def format_date_labels(ax, rot): # mini version of autofmt_xdate try: @@ -1455,8 +1447,8 @@ def plot_group(group, ax): else: fig = ax.get_figure() plot_group(data, ax) - ax.set_ylabel(com._stringify(y)) - ax.set_xlabel(com._stringify(x)) + ax.set_ylabel(com.pprint_thing(y)) + ax.set_xlabel(com.pprint_thing(x)) ax.grid(grid) @@ -1620,7 +1612,7 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, for (key, group), ax in zip(grouped, axes): d = group.boxplot(ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, figsize=figsize, **kwds) - ax.set_title(_stringify(key)) + ax.set_title(com.pprint_thing(key)) ret[key] = d else: from pandas.tools.merge import concat @@ -1676,7 +1668,7 @@ def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, if 
numeric_only and isinstance(group, DataFrame): group = group._get_numeric_data() plotf(group, ax) - ax.set_title(com._stringify(key)) + ax.set_title(com.pprint_thing(key)) return fig, axes @@ -1710,7 +1702,7 @@ def _grouped_plot_by_column(plotf, data, columns=None, by=None, gp_col = grouped[col] plotf(gp_col, ax) ax.set_title(col) - ax.set_xlabel(com._stringify(by)) + ax.set_xlabel(com.pprint_thing(by)) ax.grid(grid) byline = by[0] if len(by) == 1 else by diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py index 07905d25402cf..6f1772dd364a6 100644 --- a/pandas/tseries/plotting.py +++ b/pandas/tseries/plotting.py @@ -158,7 +158,7 @@ def _replot_ax(ax, freq, plotf, kwargs): ax._plot_data.append(series) args = _maybe_mask(series) lines.append(plotf(ax, *args, **kwds)[0]) - labels.append(com._stringify(series.name)) + labels.append(com.pprint_thing(series.name)) return lines, labels