From 2a96074b1311b33d3ef8f3b3d8e3029b02f9d2b7 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 9 Aug 2014 22:25:57 +0900 Subject: [PATCH] ENH: Data formatting with unicode length --- doc/source/options.rst | 53 +++++ doc/source/whatsnew/v0.17.0.txt | 31 +++ pandas/compat/__init__.py | 40 ++++ pandas/core/common.py | 34 ++- pandas/core/config_init.py | 15 ++ pandas/core/format.py | 152 +++++++------ pandas/core/index.py | 22 +- pandas/tests/test_categorical.py | 29 ++- pandas/tests/test_common.py | 94 ++++++++ pandas/tests/test_format.py | 353 ++++++++++++++++++++++++++++++- pandas/tests/test_index.py | 307 ++++++++++++++++++++++++++- 11 files changed, 1049 insertions(+), 81 deletions(-) diff --git a/doc/source/options.rst b/doc/source/options.rst index fb57175f96eaa..46ff2b6e5c343 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -440,3 +440,56 @@ For instance: pd.reset_option('^display\.') To round floats on a case-by-case basis, you can also use :meth:`~pandas.Series.round` and :meth:`~pandas.DataFrame.round`. + +.. _options.east_asian_width: + +Unicode Formatting +------------------ + +.. warning:: + + Enabling this option will affect the performance of printing DataFrame and Series (about 2 times slower). + Use only when it is actually required. + +Some East Asian countries use Unicode characters whose display width corresponds to 2 alphabetic characters. +If a DataFrame or Series contains these characters, the default output cannot be aligned properly. + +.. ipython:: python + + df = pd.DataFrame({u'国籍': ['UK', u'日本'], u'名前': ['Alice', u'しのぶ']}) + df + +Enabling ``display.unicode.east_asian_width`` allows pandas to check each character's "East Asian Width" property. +These characters can be aligned properly by checking this property, but it takes longer than the standard ``len`` function. + +.. ipython:: python + + pd.set_option('display.unicode.east_asian_width', True) + df + +In addition, Unicode contains characters whose width is "Ambiguous". 
The width of these characters should be either 1 or 2 depending on the terminal setting or encoding. Because this cannot be determined from within Python, the ``display.unicode.ambiguous_as_wide`` option is added to handle this. + +By default, the width of an "Ambiguous" character, such as "¡" (inverted exclamation mark) in the example below, is regarded as 1. + +.. note:: + + This should be aligned properly in a terminal which uses a monospaced font. + +.. ipython:: python + + df = pd.DataFrame({'a': ['xxx', u'¡¡'], 'b': ['yyy', u'¡¡']}) + df + +Enabling ``display.unicode.ambiguous_as_wide`` lets pandas regard the width of these characters as 2. Note that this option will be effective only when ``display.unicode.east_asian_width`` is enabled. Confirm that the starting position has changed, but the output is not aligned properly because the setting does not match this environment. + +.. ipython:: python + + pd.set_option('display.unicode.ambiguous_as_wide', True) + df + +.. ipython:: python + :suppress: + + pd.set_option('display.unicode.east_asian_width', False) + pd.set_option('display.unicode.ambiguous_as_wide', False) + diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 9990d2bd1c78d..59b69d22c3b62 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -49,6 +49,7 @@ Highlights include: - Support for reading SAS xport files, see :ref:`here ` - Documentation comparing SAS to *pandas*, see :ref:`here ` - Removal of the automatic TimeSeries broadcasting, deprecated since 0.8.0, see :ref:`here ` +- Display format with plain text can optionally align with Unicode East Asian Width, see :ref:`here <whatsnew_0170.east_asian_width>` - Compatibility with Python 3.5 (:issue:`11097`) - Compatibility with matplotlib 1.5.0 (:issue:`11111`) @@ -334,6 +335,36 @@ Google BigQuery Enhancements - The ``generate_bq_schema()`` function is now deprecated and will be removed in a future version (:issue:`11121`) - Update the gbq module to support Python 3 (:issue:`11094`). +.. 
_whatsnew_0170.east_asian_width: + +Display Alignment with Unicode East Asian Width +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. warning:: + + Enabling this option will affect the performance of printing DataFrame and Series (about 2 times slower). + Use only when it is actually required. + +Some East Asian countries use Unicode characters whose display width corresponds to 2 alphabetic characters. If a DataFrame or Series contains these characters, the default output cannot be aligned properly. The following options are added to enable precise handling for these characters. + +- ``display.unicode.east_asian_width``: Whether to use the Unicode East Asian Width to calculate the display text width. (:issue:`2612`) +- ``display.unicode.ambiguous_as_wide``: Whether to handle Unicode characters belonging to Ambiguous as Wide. (:issue:`11102`) + +.. ipython:: python + + df = pd.DataFrame({u'国籍': ['UK', u'日本'], u'名前': ['Alice', u'しのぶ']}) + df + + pd.set_option('display.unicode.east_asian_width', True) + df + +For further details, see :ref:`here <options.east_asian_width>` + +.. ipython:: python + :suppress: + + pd.set_option('display.unicode.east_asian_width', False) + .. 
_whatsnew_0170.enhancements.other: Other enhancements diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index bad7192047e19..ba5114dd7d8ba 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -35,6 +35,7 @@ from itertools import product import sys import types +from unicodedata import east_asian_width PY2 = sys.version_info[0] == 2 PY3 = (sys.version_info[0] >= 3) @@ -90,6 +91,7 @@ def lmap(*args, **kwargs): def lfilter(*args, **kwargs): return list(filter(*args, **kwargs)) + else: # Python 2 import re @@ -176,6 +178,11 @@ class to receive bound method # The license for this library can be found in LICENSES/SIX and the code can be # found at https://bitbucket.org/gutworth/six +# Definition of East Asian Width +# http://unicode.org/reports/tr11/ +# Ambiguous width can be changed by option +_EAW_MAP = {'Na': 1, 'N': 1, 'W': 2, 'F': 2, 'H': 1} + if PY3: string_types = str, integer_types = int, @@ -188,6 +195,20 @@ def u(s): def u_safe(s): return s + + def strlen(data, encoding=None): + # encoding is for compat with PY2 + return len(data) + + def east_asian_len(data, encoding=None, ambiguous_width=1): + """ + Calculate display width considering unicode East Asian Width + """ + if isinstance(data, text_type): + return sum([_EAW_MAP.get(east_asian_width(c), ambiguous_width) for c in data]) + else: + return len(data) + else: string_types = basestring, integer_types = (int, long) @@ -204,6 +225,25 @@ def u_safe(s): except: return s + def strlen(data, encoding=None): + try: + data = data.decode(encoding) + except UnicodeError: + pass + return len(data) + + def east_asian_len(data, encoding=None, ambiguous_width=1): + """ + Calculate display width considering unicode East Asian Width + """ + if isinstance(data, text_type): + try: + data = data.decode(encoding) + except UnicodeError: + pass + return sum([_EAW_MAP.get(east_asian_width(c), ambiguous_width) for c in data]) + else: + return len(data) string_and_binary_types = string_types + 
(binary_type,) diff --git a/pandas/core/common.py b/pandas/core/common.py index 2d403f904a446..2411925207696 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2149,21 +2149,33 @@ def _count_not_none(*args): -def adjoin(space, *lists): +def adjoin(space, *lists, **kwargs): """ Glues together two sets of strings using the amount of space requested. The idea is to prettify. - """ + + ---------- + space : int + number of spaces for padding + lists : str + list of str which being joined + strlen : callable + function used to calculate the length of each str. Needed for unicode + handling. + justfunc : callable + function used to justify str. Needed for unicode handling. + """ + strlen = kwargs.pop('strlen', len) + justfunc = kwargs.pop('justfunc', _justify) + out_lines = [] newLists = [] - lengths = [max(map(len, x)) + space for x in lists[:-1]] - + lengths = [max(map(strlen, x)) + space for x in lists[:-1]] # not the last one lengths.append(max(map(len, lists[-1]))) - maxLen = max(map(len, lists)) for i, lst in enumerate(lists): - nl = [x.ljust(lengths[i]) for x in lst] + nl = justfunc(lst, lengths[i], mode='left') nl.extend([' ' * lengths[i]] * (maxLen - len(lst))) newLists.append(nl) toJoin = zip(*newLists) @@ -2171,6 +2183,16 @@ def adjoin(space, *lists): out_lines.append(_join_unicode(lines)) return _join_unicode(out_lines, sep='\n') +def _justify(texts, max_len, mode='right'): + """ + Perform ljust, center, rjust against string or list-like + """ + if mode == 'left': + return [x.ljust(max_len) for x in texts] + elif mode == 'center': + return [x.center(max_len) for x in texts] + else: + return [x.rjust(max_len) for x in texts] def _join_unicode(lines, sep=''): try: diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 03eaa45582bef..751a530ce73cc 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -144,6 +144,17 @@ Deprecated. 
""" +pc_east_asian_width_doc = """ +: boolean + Whether to use the Unicode East Asian Width to calculate the display text width + Enabling this may affect to the performance (default: False) +""" +pc_ambiguous_as_wide_doc = """ +: boolean + Whether to handle Unicode characters belong to Ambiguous as Wide (width=2) + (default: False) +""" + pc_line_width_deprecation_warning = """\ line_width has been deprecated, use display.width instead (currently both are identical) @@ -282,6 +293,10 @@ def mpl_style_cb(key): pc_line_width_doc) cf.register_option('memory_usage', True, pc_memory_usage_doc, validator=is_instance_factory([type(None), bool])) + cf.register_option('unicode.east_asian_width', False, + pc_east_asian_width_doc, validator=is_bool) + cf.register_option('unicode.ambiguous_as_wide', False, + pc_east_asian_width_doc, validator=is_bool) cf.deprecate_option('display.line_width', msg=pc_line_width_deprecation_warning, diff --git a/pandas/core/format.py b/pandas/core/format.py index 0c1a3dbadbd86..5f12abb543513 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -138,6 +138,7 @@ def __init__(self, series, buf=None, length=True, header=True, float_format = get_option("display.float_format") self.float_format = float_format self.dtype = dtype + self.adj = _get_adjustment() self._chk_truncate() @@ -221,22 +222,24 @@ def to_string(self): fmt_index, have_header = self._get_formatted_index() fmt_values = self._get_formatted_values() - maxlen = max(len(x) for x in fmt_index) # max index len + maxlen = max(self.adj.len(x) for x in fmt_index) # max index len pad_space = min(maxlen, 60) if self.truncate_v: n_header_rows = 0 row_num = self.tr_row_num - width = len(fmt_values[row_num-1]) + width = self.adj.len(fmt_values[row_num-1]) if width > 3: dot_str = '...' else: dot_str = '..' 
- dot_str = dot_str.center(width) + # Series uses mode=center because it has single value columns + # DataFrame uses mode=left + dot_str = self.adj.justify([dot_str], width, mode='center')[0] fmt_values.insert(row_num + n_header_rows, dot_str) fmt_index.insert(row_num + 1, '') - result = adjoin(3, *[fmt_index[1:], fmt_values]) + result = self.adj.adjoin(3, *[fmt_index[1:], fmt_values]) if self.header and have_header: result = fmt_index[0] + '\n' + result @@ -247,19 +250,54 @@ def to_string(self): return compat.text_type(u('').join(result)) -def _strlen_func(): - if compat.PY3: # pragma: no cover - _strlen = len - else: - encoding = get_option("display.encoding") +class TextAdjustment(object): + + def __init__(self): + self.encoding = get_option("display.encoding") + + def len(self, text): + return compat.strlen(text, encoding=self.encoding) + + def justify(self, texts, max_len, mode='right'): + return com._justify(texts, max_len, mode=mode) + + def adjoin(self, space, *lists, **kwargs): + return com.adjoin(space, *lists, strlen=self.len, + justfunc=self.justify, **kwargs) + + +class EastAsianTextAdjustment(TextAdjustment): + + def __init__(self): + super(EastAsianTextAdjustment, self).__init__() + if get_option("display.unicode.ambiguous_as_wide"): + self.ambiguous_width = 2 + else: + self.ambiguous_width = 1 + + def len(self, text): + return compat.east_asian_len(text, encoding=self.encoding, + ambiguous_width=self.ambiguous_width) + + def justify(self, texts, max_len, mode='right'): + # re-calculate padding space per str considering East Asian Width + def _get_pad(t): + return max_len - self.len(t) + len(t) + + if mode == 'left': + return [x.ljust(_get_pad(x)) for x in texts] + elif mode == 'center': + return [x.center(_get_pad(x)) for x in texts] + else: + return [x.rjust(_get_pad(x)) for x in texts] - def _strlen(x): - try: - return len(x.decode(encoding)) - except UnicodeError: - return len(x) - return _strlen +def _get_adjustment(): + use_east_asian_width = 
get_option("display.unicode.east_asian_width") + if use_east_asian_width: + return EastAsianTextAdjustment() + else: + return TextAdjustment() class TableFormatter(object): @@ -338,6 +376,7 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, self.columns = frame.columns self._chk_truncate() + self.adj = _get_adjustment() def _chk_truncate(self): ''' @@ -414,7 +453,6 @@ def _to_str_columns(self): """ Render a DataFrame to a list of columns (as lists of strings). """ - _strlen = _strlen_func() frame = self.tr_frame # may include levels names also @@ -427,27 +465,23 @@ def _to_str_columns(self): for i, c in enumerate(frame): cheader = str_columns[i] max_colwidth = max(self.col_space or 0, - *(_strlen(x) for x in cheader)) - + *(self.adj.len(x) for x in cheader)) fmt_values = self._format_col(i) - fmt_values = _make_fixed_width(fmt_values, self.justify, - minimum=max_colwidth) + minimum=max_colwidth, + adj=self.adj) - max_len = max(np.max([_strlen(x) for x in fmt_values]), + max_len = max(np.max([self.adj.len(x) for x in fmt_values]), max_colwidth) - if self.justify == 'left': - cheader = [x.ljust(max_len) for x in cheader] - else: - cheader = [x.rjust(max_len) for x in cheader] - + cheader = self.adj.justify(cheader, max_len, mode=self.justify) stringified.append(cheader + fmt_values) else: stringified = [] for i, c in enumerate(frame): fmt_values = self._format_col(i) fmt_values = _make_fixed_width(fmt_values, self.justify, - minimum=(self.col_space or 0)) + minimum=(self.col_space or 0), + adj=self.adj) stringified.append(fmt_values) @@ -461,13 +495,13 @@ def _to_str_columns(self): if truncate_h: col_num = self.tr_col_num - col_width = len(strcols[self.tr_size_col][0]) # infer from column header + col_width = self.adj.len(strcols[self.tr_size_col][0]) # infer from column header strcols.insert(self.tr_col_num + 1, ['...'.center(col_width)] * (len(str_index))) if truncate_v: n_header_rows = len(str_index) - len(frame) row_num = self.tr_row_num for ix, 
col in enumerate(strcols): - cwidth = len(strcols[ix][row_num]) # infer from above row + cwidth = self.adj.len(strcols[ix][row_num]) # infer from above row is_dot_col = False if truncate_h: is_dot_col = ix == col_num + 1 @@ -477,13 +511,13 @@ def _to_str_columns(self): my_str = '..' if ix == 0: - dot_str = my_str.ljust(cwidth) + dot_mode = 'left' elif is_dot_col: - cwidth = len(strcols[self.tr_size_col][0]) - dot_str = my_str.center(cwidth) + cwidth = self.adj.len(strcols[self.tr_size_col][0]) + dot_mode = 'center' else: - dot_str = my_str.rjust(cwidth) - + dot_mode = 'right' + dot_str = self.adj.justify([my_str], cwidth, mode=dot_mode)[0] strcols[ix].insert(row_num + n_header_rows, dot_str) return strcols @@ -492,6 +526,7 @@ def to_string(self): Render a DataFrame to a console-friendly tabular output. """ from pandas import Series + frame = self.frame if len(frame.columns) == 0 or len(frame.index) == 0: @@ -503,11 +538,11 @@ def to_string(self): else: strcols = self._to_str_columns() if self.line_width is None: # no need to wrap around just print the whole frame - text = adjoin(1, *strcols) + text = self.adj.adjoin(1, *strcols) elif not isinstance(self.max_cols, int) or self.max_cols > 0: # need to wrap around text = self._join_multiline(*strcols) else: # max_cols == 0. 
Try to fit frame to terminal - text = adjoin(1, *strcols).split('\n') + text = self.adj.adjoin(1, *strcols).split('\n') row_lens = Series(text).apply(len) max_len_col_ix = np.argmax(row_lens) max_len = row_lens[max_len_col_ix] @@ -535,7 +570,7 @@ def to_string(self): # and then generate string representation self._chk_truncate() strcols = self._to_str_columns() - text = adjoin(1, *strcols) + text = self.adj.adjoin(1, *strcols) self.buf.writelines(text) @@ -549,9 +584,9 @@ def _join_multiline(self, *strcols): strcols = list(strcols) if self.index: idx = strcols.pop(0) - lwidth -= np.array([len(x) for x in idx]).max() + adjoin_width + lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width - col_widths = [np.array([len(x) for x in col]).max() + col_widths = [np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 for col in strcols] col_bins = _binify(col_widths, lwidth) @@ -572,8 +607,7 @@ def _join_multiline(self, *strcols): row.append([' \\'] + [' '] * (nrows - 1)) else: row.append([' '] * nrows) - - str_lst.append(adjoin(adjoin_width, *row)) + str_lst.append(self.adj.adjoin(adjoin_width, *row)) st = ed return '\n\n'.join(str_lst) @@ -776,11 +810,12 @@ def _get_formatted_index(self, frame): formatter=fmt) else: fmt_index = [index.format(name=show_index_names, formatter=fmt)] - fmt_index = [tuple(_make_fixed_width( - list(x), justify='left', minimum=(self.col_space or 0))) - for x in fmt_index] + fmt_index = [tuple(_make_fixed_width(list(x), justify='left', + minimum=(self.col_space or 0), + adj=self.adj)) + for x in fmt_index] - adjoined = adjoin(1, *fmt_index).split('\n') + adjoined = self.adj.adjoin(1, *fmt_index).split('\n') # empty space for columns if show_col_names: @@ -2222,13 +2257,16 @@ def _formatter(x): return _formatter -def _make_fixed_width(strings, justify='right', minimum=None): +def _make_fixed_width(strings, justify='right', minimum=None, + adj=None): + if len(strings) == 0 or justify == 'all': return strings - 
_strlen = _strlen_func() + if adj is None: + adj = _get_adjustment() - max_len = np.max([_strlen(x) for x in strings]) + max_len = np.max([adj.len(x) for x in strings]) if minimum is not None: max_len = max(minimum, max_len) @@ -2237,22 +2275,14 @@ def _make_fixed_width(strings, justify='right', minimum=None): if conf_max is not None and max_len > conf_max: max_len = conf_max - if justify == 'left': - justfunc = lambda self, x: self.ljust(x) - else: - justfunc = lambda self, x: self.rjust(x) - def just(x): - eff_len = max_len - if conf_max is not None: - if (conf_max > 3) & (_strlen(x) > max_len): - x = x[:eff_len - 3] + '...' - - return justfunc(x, eff_len) - - result = [just(x) for x in strings] + if (conf_max > 3) & (adj.len(x) > max_len): + x = x[:max_len - 3] + '...' + return x + strings = [just(x) for x in strings] + result = adj.justify(strings, max_len, mode=justify) return result diff --git a/pandas/core/index.py b/pandas/core/index.py index d64a20fc9563c..1daa0e1b52d02 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -488,7 +488,7 @@ def _format_data(self): """ Return the formatted data as a unicode string """ - from pandas.core.format import get_console_size + from pandas.core.format import get_console_size, _get_adjustment display_width, _ = get_console_size() if display_width is None: display_width = get_option('display.width') or 80 @@ -502,14 +502,19 @@ def _format_data(self): formatter = self._formatter_func # do we want to justify (only do so for non-objects) - is_justify = not (self.inferred_type == 'string' or self.inferred_type == 'categorical' and is_object_dtype(self.categories)) + is_justify = not (self.inferred_type in ('string', 'unicode') or + (self.inferred_type == 'categorical' and + is_object_dtype(self.categories))) # are we a truncated display is_truncated = n > max_seq_items + # adj can optionaly handle unicode eastern asian width + adj = _get_adjustment() + def _extend_line(s, line, value, display_width, 
next_line_prefix): - if len(line.rstrip()) + len(value.rstrip()) >= display_width: + if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width: s += line.rstrip() line = next_line_prefix line += value @@ -517,7 +522,7 @@ def _extend_line(s, line, value, display_width, next_line_prefix): def best_len(values): if values: - return max([len(x) for x in values]) + return max([adj.len(x) for x in values]) else: return 0 @@ -556,8 +561,10 @@ def best_len(values): word = head[i] + sep + ' ' summary, line = _extend_line(summary, line, word, display_width, space2) + if is_truncated: - summary += line + space2 + '...' + # remove trailing space of last line + summary += line.rstrip() + space2 + '...' line = space2 for i in range(len(tail)-1): @@ -4501,8 +4508,11 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, start=int(names), sentinel=sentinel) + if adjoin: - return com.adjoin(space, *result_levels).split('\n') + from pandas.core.format import _get_adjustment + adj = _get_adjustment() + return adj.adjoin(space, *result_levels).split('\n') else: return result_levels diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 9173c0a87f6c2..e97010e1cb552 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -2,7 +2,7 @@ # pylint: disable=E1101,E1103,W0232 from datetime import datetime -from pandas.compat import range, lrange, u +from pandas.compat import range, lrange, u, PY3 import os import pickle import re @@ -534,6 +534,33 @@ def test_print_none_width(self): with option_context("display.width", None): self.assertEqual(exp, repr(a)) + def test_unicode_print(self): + if PY3: + _rep = repr + else: + _rep = unicode + + c = pd.Categorical(['aaaaa', 'bb', 'cccc'] * 20) + expected = u"""[aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc] +Length: 60 +Categories (3, object): [aaaaa, bb, cccc]""" + self.assertEqual(_rep(c), expected) + + c = pd.Categorical([u'ああああ', u'いいいいい', 
u'ううううううう'] * 20) + expected = u"""[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう] +Length: 60 +Categories (3, object): [ああああ, いいいいい, ううううううう]""" + self.assertEqual(_rep(c), expected) + + # unicode option should not affect to Categorical, as it doesn't care the repr width + with option_context('display.unicode.east_asian_width', True): + + c = pd.Categorical([u'ああああ', u'いいいいい', u'ううううううう'] * 20) + expected = u"""[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう] +Length: 60 +Categories (3, object): [ああああ, いいいいい, ううううううう]""" + self.assertEqual(_rep(c), expected) + def test_periodindex(self): idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03'], freq='M') diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index c488d22da7dfe..003fd134cf210 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -14,6 +14,7 @@ from pandas.core.common import notnull, isnull, array_equivalent import pandas.core.common as com import pandas.core.convert as convert +import pandas.core.format as fmt import pandas.util.testing as tm import pandas.core.config as cf @@ -332,6 +333,99 @@ def test_adjoin(): assert(adjoined == expected) + +class TestFormattBase(tm.TestCase): + + def test_adjoin(self): + data = [['a', 'b', 'c'], + ['dd', 'ee', 'ff'], + ['ggg', 'hhh', 'iii']] + expected = 'a dd ggg\nb ee hhh\nc ff iii' + + adjoined = com.adjoin(2, *data) + + self.assertEqual(adjoined, expected) + + def test_adjoin_unicode(self): + data = [[u'あ', 'b', 'c'], + ['dd', u'ええ', 'ff'], + ['ggg', 'hhh', u'いいい']] + expected = u'あ dd ggg\nb ええ hhh\nc ff いいい' + adjoined = com.adjoin(2, *data) + self.assertEqual(adjoined, expected) + + adj = fmt.EastAsianTextAdjustment() + + expected = u"""あ dd ggg +b ええ hhh +c ff いいい""" + adjoined = adj.adjoin(2, *data) + self.assertEqual(adjoined, expected) + cols = adjoined.split('\n') + self.assertEqual(adj.len(cols[0]), 13) + 
self.assertEqual(adj.len(cols[1]), 13) + self.assertEqual(adj.len(cols[2]), 16) + + expected = u"""あ dd ggg +b ええ hhh +c ff いいい""" + adjoined = adj.adjoin(7, *data) + self.assertEqual(adjoined, expected) + cols = adjoined.split('\n') + self.assertEqual(adj.len(cols[0]), 23) + self.assertEqual(adj.len(cols[1]), 23) + self.assertEqual(adj.len(cols[2]), 26) + + def test_justify(self): + adj = fmt.EastAsianTextAdjustment() + + def just(x, *args, **kwargs): + # wrapper to test single str + return adj.justify([x], *args, **kwargs)[0] + + self.assertEqual(just('abc', 5, mode='left'), 'abc ') + self.assertEqual(just('abc', 5, mode='center'), ' abc ') + self.assertEqual(just('abc', 5, mode='right'), ' abc') + self.assertEqual(just(u'abc', 5, mode='left'), 'abc ') + self.assertEqual(just(u'abc', 5, mode='center'), ' abc ') + self.assertEqual(just(u'abc', 5, mode='right'), ' abc') + + self.assertEqual(just(u'パンダ', 5, mode='left'), u'パンダ') + self.assertEqual(just(u'パンダ', 5, mode='center'), u'パンダ') + self.assertEqual(just(u'パンダ', 5, mode='right'), u'パンダ') + + self.assertEqual(just(u'パンダ', 10, mode='left'), u'パンダ ') + self.assertEqual(just(u'パンダ', 10, mode='center'), u' パンダ ') + self.assertEqual(just(u'パンダ', 10, mode='right'), u' パンダ') + + def test_east_asian_len(self): + adj = fmt.EastAsianTextAdjustment() + + self.assertEqual(adj.len('abc'), 3) + self.assertEqual(adj.len(u'abc'), 3) + + self.assertEqual(adj.len(u'パンダ'), 6) + self.assertEqual(adj.len(u'パンダ'), 5) + self.assertEqual(adj.len(u'パンダpanda'), 11) + self.assertEqual(adj.len(u'パンダpanda'), 10) + + + def test_ambiguous_width(self): + adj = fmt.EastAsianTextAdjustment() + self.assertEqual(adj.len(u'¡¡ab'), 4) + + with cf.option_context('display.unicode.ambiguous_as_wide', True): + adj = fmt.EastAsianTextAdjustment() + self.assertEqual(adj.len(u'¡¡ab'), 6) + + data = [[u'あ', 'b', 'c'], + ['dd', u'ええ', 'ff'], + ['ggg', u'¡¡ab', u'いいい']] + expected = u'あ dd ggg \nb ええ ¡¡ab\nc ff いいい' + adjoined = adj.adjoin(2, *data) + 
self.assertEqual(adjoined, expected) + + def test_iterpairs(): data = [1, 2, 3, 4] expected = [(1, 2), diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 58c365029a694..b5220c8cb2706 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -162,10 +162,10 @@ def test_repr_truncation(self): r = repr(df) r = r[r.find('\n') + 1:] - _strlen = fmt._strlen_func() + adj = fmt._get_adjustment() for line, value in lzip(r.split('\n'), df['B']): - if _strlen(value) + 1 > max_len: + if adj.len(value) + 1 > max_len: self.assertIn('...', line) else: self.assertNotIn('...', line) @@ -438,6 +438,209 @@ def test_to_string_with_formatters_unicode(self): self.assertEqual(result, u(' c/\u03c3\n') + '0 1\n1 2\n2 3') + def test_east_asian_unicode_frame(self): + if PY3: + _rep = repr + else: + _rep = unicode + + # not alighned properly because of east asian width + + # mid col + df = DataFrame({'a': [u'あ', u'いいい', u'う', u'ええええええ'], + 'b': [1, 222, 33333, 4]}, + index=['a', 'bb', 'c', 'ddd']) + expected = (u" a b\na あ 1\n" + u"bb いいい 222\nc う 33333\n" + u"ddd ええええええ 4") + self.assertEqual(_rep(df), expected) + + # last col + df = DataFrame({'a': [1, 222, 33333, 4], + 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, + index=['a', 'bb', 'c', 'ddd']) + expected = (u" a b\na 1 あ\n" + u"bb 222 いいい\nc 33333 う\n" + u"ddd 4 ええええええ") + self.assertEqual(_rep(df), expected) + + # all col + df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'], + 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, + index=['a', 'bb', 'c', 'ddd']) + expected = (u" a b\na あああああ あ\n" + u"bb い いいい\nc う う\n" + u"ddd えええ ええええええ") + self.assertEqual(_rep(df), expected) + + # column name + df = DataFrame({u'あああああ': [1, 222, 33333, 4], + 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, + index=['a', 'bb', 'c', 'ddd']) + expected = (u" b あああああ\na あ 1\n" + u"bb いいい 222\nc う 33333\n" + u"ddd ええええええ 4") + self.assertEqual(_rep(df), expected) + + # index + df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'], + 'b': 
[u'あ', u'いいい', u'う', u'ええええええ']}, + index=[u'あああ', u'いいいいいい', u'うう', u'え']) + expected = (u" a b\nあああ あああああ あ\n" + u"いいいいいい い いいい\nうう う う\n" + u"え えええ ええええええ") + self.assertEqual(_rep(df), expected) + + # index name + df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'], + 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, + index=pd.Index([u'あ', u'い', u'うう', u'え'], name=u'おおおお')) + expected = (u" a b\nおおおお \nあ あああああ あ\n" + u"い い いいい\nうう う う\nえ えええ ええええええ") + self.assertEqual(_rep(df), expected) + + # all + df = DataFrame({u'あああ': [u'あああ', u'い', u'う', u'えええええ'], + u'いいいいい': [u'あ', u'いいい', u'う', u'ええ']}, + index=pd.Index([u'あ', u'いいい', u'うう', u'え'], name=u'お')) + expected = (u" あああ いいいいい\nお \nあ あああ あ\n" + u"いいい い いいい\nうう う う\nえ えええええ ええ") + self.assertEqual(_rep(df), expected) + + # MultiIndex + idx = pd.MultiIndex.from_tuples([(u'あ', u'いい'), (u'う', u'え'), + (u'おおお', u'かかかか'), (u'き', u'くく')]) + df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'], + 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, index=idx) + expected = (u" a b\nあ いい あああああ あ\n" + u"う え い いいい\nおおお かかかか う う\n" + u"き くく えええ ええええええ") + self.assertEqual(_rep(df), expected) + + # truncate + with option_context('display.max_rows', 3, 'display.max_columns', 3): + df = pd.DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'], + 'b': [u'あ', u'いいい', u'う', u'ええええええ'], + 'c': [u'お', u'か', u'ききき', u'くくくくくく'], + u'ああああ': [u'さ', u'し', u'す', u'せ']}, + columns=['a', 'b', 'c', u'ああああ']) + + expected = (u" a ... ああああ\n0 あああああ ... さ\n" + u".. ... ... ...\n3 えええ ... せ\n" + u"\n[4 rows x 4 columns]") + self.assertEqual(_rep(df), expected) + + df.index = [u'あああ', u'いいいい', u'う', 'aaa'] + expected = (u" a ... ああああ\nあああ あああああ ... さ\n" + u".. ... ... ...\naaa えええ ... 
せ\n" + u"\n[4 rows x 4 columns]") + self.assertEqual(_rep(df), expected) + + # Emable Unicode option ----------------------------------------- + with option_context('display.unicode.east_asian_width', True): + + # mid col + df = DataFrame({'a': [u'あ', u'いいい', u'う', u'ええええええ'], + 'b': [1, 222, 33333, 4]}, + index=['a', 'bb', 'c', 'ddd']) + expected = (u" a b\na あ 1\n" + u"bb いいい 222\nc う 33333\n" + u"ddd ええええええ 4") + self.assertEqual(_rep(df), expected) + + # last col + df = DataFrame({'a': [1, 222, 33333, 4], + 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, + index=['a', 'bb', 'c', 'ddd']) + expected = (u" a b\na 1 あ\n" + u"bb 222 いいい\nc 33333 う\n" + u"ddd 4 ええええええ") + self.assertEqual(_rep(df), expected) + + # all col + df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'], + 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, + index=['a', 'bb', 'c', 'ddd']) + expected = (u" a b\na あああああ あ\n" + u"bb い いいい\nc う う\n" + u"ddd えええ ええええええ""") + self.assertEqual(_rep(df), expected) + + # column name + df = DataFrame({u'あああああ': [1, 222, 33333, 4], + 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, + index=['a', 'bb', 'c', 'ddd']) + expected = (u" b あああああ\na あ 1\n" + u"bb いいい 222\nc う 33333\n" + u"ddd ええええええ 4") + self.assertEqual(_rep(df), expected) + + # index + df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'], + 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, + index=[u'あああ', u'いいいいいい', u'うう', u'え']) + expected = (u" a b\nあああ あああああ あ\n" + u"いいいいいい い いいい\nうう う う\n" + u"え えええ ええええええ") + self.assertEqual(_rep(df), expected) + + # index name + df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'], + 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, + index=pd.Index([u'あ', u'い', u'うう', u'え'], name=u'おおおお')) + expected = (u" a b\nおおおお \n" + u"あ あああああ あ\nい い いいい\n" + u"うう う う\nえ えええ ええええええ") + self.assertEqual(_rep(df), expected) + + # all + df = DataFrame({u'あああ': [u'あああ', u'い', u'う', u'えええええ'], + u'いいいいい': [u'あ', u'いいい', u'う', u'ええ']}, + index=pd.Index([u'あ', u'いいい', u'うう', u'え'], name=u'お')) + expected = (u" あああ いいいいい\nお 
\n" + u"あ あああ あ\nいいい い いいい\n" + u"うう う う\nえ えええええ ええ") + self.assertEqual(_rep(df), expected) + + # MultiIndex + idx = pd.MultiIndex.from_tuples([(u'あ', u'いい'), (u'う', u'え'), + (u'おおお', u'かかかか'), (u'き', u'くく')]) + df = DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'], + 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, index=idx) + expected = (u" a b\nあ いい あああああ あ\n" + u"う え い いいい\nおおお かかかか う う\n" + u"き くく えええ ええええええ") + self.assertEqual(_rep(df), expected) + + # truncate + with option_context('display.max_rows', 3, 'display.max_columns', 3): + + df = pd.DataFrame({'a': [u'あああああ', u'い', u'う', u'えええ'], + 'b': [u'あ', u'いいい', u'う', u'ええええええ'], + 'c': [u'お', u'か', u'ききき', u'くくくくくく'], + u'ああああ': [u'さ', u'し', u'す', u'せ']}, + columns=['a', 'b', 'c', u'ああああ']) + + expected = (u" a ... ああああ\n0 あああああ ... さ\n" + u".. ... ... ...\n3 えええ ... せ\n" + u"\n[4 rows x 4 columns]") + self.assertEqual(_rep(df), expected) + + df.index = [u'あああ', u'いいいい', u'う', 'aaa'] + expected = (u" a ... ああああ\nあああ あああああ ... さ\n" + u"... ... ... ...\naaa えええ ... 
せ\n" + u"\n[4 rows x 4 columns]") + self.assertEqual(_rep(df), expected) + + # ambiguous unicode + df = DataFrame({u'あああああ': [1, 222, 33333, 4], + 'b': [u'あ', u'いいい', u'¡¡', u'ええええええ']}, + index=['a', 'bb', 'c', '¡¡¡']) + expected = (u" b あああああ\na あ 1\n" + u"bb いいい 222\nc ¡¡ 33333\n" + u"¡¡¡ ええええええ 4") + self.assertEqual(_rep(df), expected) + def test_to_string_buffer_all_unicode(self): buf = StringIO() @@ -895,10 +1098,6 @@ def test_to_html_regression_GH6098(self): # it works df.pivot_table(index=[u('clé1')], columns=[u('clé2')])._repr_html_() - - - - def test_to_html_truncate(self): raise nose.SkipTest("unreliable on travis") index = pd.DatetimeIndex(start='20010101',freq='D',periods=20) @@ -2888,6 +3087,148 @@ def test_unicode_name_in_footer(self): sf = fmt.SeriesFormatter(s, name=u('\u05e2\u05d1\u05e8\u05d9\u05ea')) sf._get_footer() # should not raise exception + def test_east_asian_unicode_series(self): + if PY3: + _rep = repr + else: + _rep = unicode + # not aligned properly because of east asian width + + # unicode index + s = Series(['a', 'bb', 'CCC', 'D'], + index=[u'あ', u'いい', u'ううう', u'ええええ']) + expected = (u"あ a\nいい bb\nううう CCC\n" + u"ええええ D\ndtype: object") + self.assertEqual(_rep(s), expected) + + # unicode values + s = Series([u'あ', u'いい', u'ううう', u'ええええ'], index=['a', 'bb', 'c', 'ddd']) + expected = (u"a あ\nbb いい\nc ううう\n" + u"ddd ええええ\ndtype: object") + self.assertEqual(_rep(s), expected) + + # both + s = Series([u'あ', u'いい', u'ううう', u'ええええ'], + index=[u'ああ', u'いいいい', u'う', u'えええ']) + expected = (u"ああ あ\nいいいい いい\nう ううう\n" + u"えええ ええええ\ndtype: object") + self.assertEqual(_rep(s), expected) + + # unicode footer + s = Series([u'あ', u'いい', u'ううう', u'ええええ'], + index=[u'ああ', u'いいいい', u'う', u'えええ'], + name=u'おおおおおおお') + expected = (u"ああ あ\nいいいい いい\nう ううう\n" + u"えええ ええええ\nName: おおおおおおお, dtype: object") + self.assertEqual(_rep(s), expected) + + # MultiIndex + idx = pd.MultiIndex.from_tuples([(u'あ', u'いい'), (u'う', u'え'), + (u'おおお', u'かかかか'), (u'き', u'くく')])
+ s = Series([1, 22, 3333, 44444], index=idx) + expected = (u"あ いい 1\nう え 22\nおおお かかかか 3333\n" + u"き くく 44444\ndtype: int64") + self.assertEqual(_rep(s), expected) + + # object dtype, shorter than unicode repr + s = Series([1, 22, 3333, 44444], index=[1, 'AB', np.nan, u'あああ']) + expected = (u"1 1\nAB 22\nNaN 3333\n" + u"あああ 44444\ndtype: int64") + self.assertEqual(_rep(s), expected) + + # object dtype, longer than unicode repr + s = Series([1, 22, 3333, 44444], + index=[1, 'AB', pd.Timestamp('2011-01-01'), u'あああ']) + expected = (u"1 1\nAB 22\n" + u"2011-01-01 00:00:00 3333\nあああ 44444\ndtype: int64") + self.assertEqual(_rep(s), expected) + + # truncate + with option_context('display.max_rows', 3): + s = Series([u'あ', u'いい', u'ううう', u'ええええ'], + name=u'おおおおおおお') + + expected = (u"0 あ\n ... \n" + u"3 ええええ\nName: おおおおおおお, dtype: object") + self.assertEqual(_rep(s), expected) + + s.index = [u'ああ', u'いいいい', u'う', u'えええ'] + expected = (u"ああ あ\n ... \n" + u"えええ ええええ\nName: おおおおおおお, dtype: object") + self.assertEqual(_rep(s), expected) + + # Enable Unicode option ----------------------------------------- + with option_context('display.unicode.east_asian_width', True): + + # unicode index + s = Series(['a', 'bb', 'CCC', 'D'], + index=[u'あ', u'いい', u'ううう', u'ええええ']) + expected = (u"あ a\nいい bb\nううう CCC\n" + u"ええええ D\ndtype: object") + self.assertEqual(_rep(s), expected) + + # unicode values + s = Series([u'あ', u'いい', u'ううう', u'ええええ'], index=['a', 'bb', 'c', 'ddd']) + expected = (u"a あ\nbb いい\nc ううう\n" + u"ddd ええええ\ndtype: object") + self.assertEqual(_rep(s), expected) + + # both + s = Series([u'あ', u'いい', u'ううう', u'ええええ'], + index=[u'ああ', u'いいいい', u'う', u'えええ']) + expected = (u"ああ あ\nいいいい いい\nう ううう\n" + u"えええ ええええ\ndtype: object") + self.assertEqual(_rep(s), expected) + + # unicode footer + s = Series([u'あ', u'いい', u'ううう', u'ええええ'], + index=[u'ああ', u'いいいい', u'う', u'えええ'], + name=u'おおおおおおお') + expected = (u"ああ あ\nいいいい いい\nう ううう\n" + u"えええ ええええ\nName: おおおおおおお, dtype: object") +
self.assertEqual(_rep(s), expected) + + # MultiIndex + idx = pd.MultiIndex.from_tuples([(u'あ', u'いい'), (u'う', u'え'), + (u'おおお', u'かかかか'), (u'き', u'くく')]) + s = Series([1, 22, 3333, 44444], index=idx) + expected = (u"あ いい 1\nう え 22\nおおお かかかか 3333\n" + u"き くく 44444\ndtype: int64") + self.assertEqual(_rep(s), expected) + + # object dtype, shorter than unicode repr + s = Series([1, 22, 3333, 44444], index=[1, 'AB', np.nan, u'あああ']) + expected = (u"1 1\nAB 22\nNaN 3333\n" + u"あああ 44444\ndtype: int64") + self.assertEqual(_rep(s), expected) + + # object dtype, longer than unicode repr + s = Series([1, 22, 3333, 44444], + index=[1, 'AB', pd.Timestamp('2011-01-01'), u'あああ']) + expected = (u"1 1\nAB 22\n" + u"2011-01-01 00:00:00 3333\nあああ 44444\ndtype: int64") + self.assertEqual(_rep(s), expected) + + # truncate + with option_context('display.max_rows', 3): + s = Series([u'あ', u'いい', u'ううう', u'ええええ'], + name=u'おおおおおおお') + expected = (u"0 あ\n ... \n" + u"3 ええええ\nName: おおおおおおお, dtype: object") + self.assertEqual(_rep(s), expected) + + s.index = [u'ああ', u'いいいい', u'う', u'えええ'] + expected = (u"ああ あ\n ... 
\n" + u"えええ ええええ\nName: おおおおおおお, dtype: object") + self.assertEqual(_rep(s), expected) + + # ambiguous unicode + s = Series([u'¡¡', u'い¡¡', u'ううう', u'ええええ'], + index=[u'ああ', u'¡¡¡¡いい', u'¡¡', u'えええ']) + expected = (u"ああ ¡¡\n¡¡¡¡いい い¡¡\n¡¡ ううう\n" + u"えええ ええええ\ndtype: object") + self.assertEqual(_rep(s), expected) + def test_float_trim_zeros(self): vals = [2.08430917305e+10, 3.52205017305e+10, 2.30674817305e+10, 2.03954217305e+10, 5.59897817305e+10] diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 75daabe2dab67..81ebc7efdbdd9 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -2,7 +2,7 @@ # pylint: disable=E1101,E1103,W0232 from datetime import datetime, timedelta, time -from pandas.compat import range, lrange, lzip, u, zip +from pandas.compat import range, lrange, lzip, u, zip, PY3 import operator import re import nose @@ -1842,6 +1842,137 @@ def test_conversion_preserves_name(self): self.assertEqual(i.name, pd.to_datetime(i).name) self.assertEqual(i.name, pd.to_timedelta(i).name) + def test_string_index_repr(self): + # py3/py2 repr can differ because of "u" prefix + # which also affects to displayed element size + + # short + idx = pd.Index(['a', 'bb', 'ccc']) + if PY3: + expected = u"""Index(['a', 'bb', 'ccc'], dtype='object')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""Index([u'a', u'bb', u'ccc'], dtype='object')""" + self.assertEqual(unicode(idx), expected) + + # multiple lines + idx = pd.Index(['a', 'bb', 'ccc'] * 10) + if PY3: + expected = u"""Index(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', + 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', + 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], + dtype='object')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""Index([u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', + u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', + u'ccc', u'a', u'bb', 
u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc'], + dtype='object')""" + self.assertEqual(unicode(idx), expected) + + # truncated + idx = pd.Index(['a', 'bb', 'ccc'] * 100) + if PY3: + expected = u"""Index(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', + ... + 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], + dtype='object', length=300)""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""Index([u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', + ... + u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc'], + dtype='object', length=300)""" + self.assertEqual(unicode(idx), expected) + + # short + idx = pd.Index([u'あ', u'いい', u'ううう']) + if PY3: + expected = u"""Index(['あ', 'いい', 'ううう'], dtype='object')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""Index([u'あ', u'いい', u'ううう'], dtype='object')""" + self.assertEqual(unicode(idx), expected) + + # multiple lines + idx = pd.Index([u'あ', u'いい', u'ううう'] * 10) + if PY3: + expected = u"""Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + dtype='object')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""Index([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', + u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', + u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう'], + dtype='object')""" + self.assertEqual(unicode(idx), expected) + + # truncated + idx = pd.Index([u'あ', u'いい', u'ううう'] * 100) + if PY3: + expected = u"""Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', + ... 
+ 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + dtype='object', length=300)""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""Index([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', + ... + u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう'], + dtype='object', length=300)""" + self.assertEqual(unicode(idx), expected) + + # Enable Unicode option ----------------------------------------- + with cf.option_context('display.unicode.east_asian_width', True): + + # short + idx = pd.Index([u'あ', u'いい', u'ううう']) + if PY3: + expected = u"""Index(['あ', 'いい', 'ううう'], dtype='object')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""Index([u'あ', u'いい', u'ううう'], dtype='object')""" + self.assertEqual(unicode(idx), expected) + + # multiple lines + idx = pd.Index([u'あ', u'いい', u'ううう'] * 10) + if PY3: + expected = u"""Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう'], + dtype='object')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""Index([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', + u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', + u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', + u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう'], + dtype='object')""" + self.assertEqual(unicode(idx), expected) + + # truncated + idx = pd.Index([u'あ', u'いい', u'ううう'] * 100) + if PY3: + expected = u"""Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', + ... + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう'], + dtype='object', length=300)""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""Index([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', + u'ううう', u'あ', + ...
+ u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', + u'いい', u'ううう'], + dtype='object', length=300)""" + self.assertEqual(unicode(idx), expected) + + class TestCategoricalIndex(Base, tm.TestCase): _holder = CategoricalIndex @@ -2211,6 +2342,180 @@ def test_equals(self): self.assertFalse(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b']).equals(list('aabca'))) self.assertTrue(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b']).equals(list('aabca') + [np.nan])) + def test_string_categorical_index_repr(self): + # short + idx = pd.CategoricalIndex(['a', 'bb', 'ccc']) + if PY3: + expected = u"""CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""CategoricalIndex([u'a', u'bb', u'ccc'], categories=[u'a', u'bb', u'ccc'], ordered=False, dtype='category')""" + self.assertEqual(unicode(idx), expected) + + # multiple lines + idx = pd.CategoricalIndex(['a', 'bb', 'ccc'] * 10) + if PY3: + expected = u"""CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', + 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', + 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""CategoricalIndex([u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', + u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', + u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', + u'a', u'bb', u'ccc', u'a', u'bb', u'ccc'], + categories=[u'a', u'bb', u'ccc'], ordered=False, dtype='category')""" + self.assertEqual(unicode(idx), expected) + + # truncated + idx = pd.CategoricalIndex(['a', 'bb', 'ccc'] * 100) + if PY3: + expected = u"""CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', + ... 
+ 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""CategoricalIndex([u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', + u'ccc', u'a', + ... + u'ccc', u'a', u'bb', u'ccc', u'a', u'bb', u'ccc', u'a', + u'bb', u'ccc'], + categories=[u'a', u'bb', u'ccc'], ordered=False, dtype='category', length=300)""" + self.assertEqual(unicode(idx), expected) + + # larger categories + idx = pd.CategoricalIndex(list('abcdefghijklmmo')) + if PY3: + expected = u"""CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'm', 'o'], + categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""CategoricalIndex([u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', u'i', u'j', + u'k', u'l', u'm', u'm', u'o'], + categories=[u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', ...], ordered=False, dtype='category')""" + + self.assertEqual(unicode(idx), expected) + + # short + idx = pd.CategoricalIndex([u'あ', u'いい', u'ううう']) + if PY3: + expected = u"""CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""CategoricalIndex([u'あ', u'いい', u'ううう'], categories=[u'あ', u'いい', u'ううう'], ordered=False, dtype='category')""" + self.assertEqual(unicode(idx), expected) + + # multiple lines + idx = pd.CategoricalIndex([u'あ', u'いい', u'ううう'] * 10) + if PY3: + expected = u"""CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', + 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" + self.assertEqual(repr(idx), expected) + else: + expected = 
u"""CategoricalIndex([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', + u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', + u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', + u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう'], + categories=[u'あ', u'いい', u'ううう'], ordered=False, dtype='category')""" + self.assertEqual(unicode(idx), expected) + + # truncated + idx = pd.CategoricalIndex([u'あ', u'いい', u'ううう'] * 100) + if PY3: + expected = u"""CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', + ... + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""CategoricalIndex([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', + u'ううう', u'あ', + ... + u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', + u'いい', u'ううう'], + categories=[u'あ', u'いい', u'ううう'], ordered=False, dtype='category', length=300)""" + self.assertEqual(unicode(idx), expected) + + # larger categories + idx = pd.CategoricalIndex(list(u'あいうえおかきくけこさしすせそ')) + if PY3: + expected = u"""CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', + 'す', 'せ', 'そ'], + categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""CategoricalIndex([u'あ', u'い', u'う', u'え', u'お', u'か', u'き', u'く', u'け', u'こ', + u'さ', u'し', u'す', u'せ', u'そ'], + categories=[u'あ', u'い', u'う', u'え', u'お', u'か', u'き', u'く', ...], ordered=False, dtype='category')""" + self.assertEqual(unicode(idx), expected) + + # Enable Unicode option ----------------------------------------- + with cf.option_context('display.unicode.east_asian_width', True): + + # short + idx = pd.CategoricalIndex([u'あ', u'いい', u'ううう']) + if PY3: + expected = u"""CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" +
self.assertEqual(repr(idx), expected) + else: + expected = u"""CategoricalIndex([u'あ', u'いい', u'ううう'], categories=[u'あ', u'いい', u'ううう'], ordered=False, dtype='category')""" + self.assertEqual(unicode(idx), expected) + + # multiple lines + idx = pd.CategoricalIndex([u'あ', u'いい', u'ううう'] * 10) + if PY3: + expected = u"""CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""CategoricalIndex([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', + u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', + u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', + u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', + u'いい', u'ううう', u'あ', u'いい', u'ううう'], + categories=[u'あ', u'いい', u'ううう'], ordered=False, dtype='category')""" + self.assertEqual(unicode(idx), expected) + + # truncated + idx = pd.CategoricalIndex([u'あ', u'いい', u'ううう'] * 100) + if PY3: + expected = u"""CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', + ... + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""CategoricalIndex([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', + u'いい', u'ううう', u'あ', + ... 
+ u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', + u'ううう', u'あ', u'いい', u'ううう'], + categories=[u'あ', u'いい', u'ううう'], ordered=False, dtype='category', length=300)""" + self.assertEqual(unicode(idx), expected) + + # larger categories + idx = pd.CategoricalIndex(list(u'あいうえおかきくけこさしすせそ')) + if PY3: + expected = u"""CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', + 'さ', 'し', 'す', 'せ', 'そ'], + categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" + self.assertEqual(repr(idx), expected) + else: + expected = u"""CategoricalIndex([u'あ', u'い', u'う', u'え', u'お', u'か', u'き', u'く', + u'け', u'こ', u'さ', u'し', u'す', u'せ', u'そ'], + categories=[u'あ', u'い', u'う', u'え', u'お', u'か', u'き', u'く', ...], ordered=False, dtype='category')""" + self.assertEqual(unicode(idx), expected) + class Numeric(Base):