diff --git a/doc/source/_static/trunc_after.png b/doc/source/_static/trunc_after.png new file mode 100644 index 0000000000000..950690de8d1ee Binary files /dev/null and b/doc/source/_static/trunc_after.png differ diff --git a/doc/source/_static/trunc_before.png b/doc/source/_static/trunc_before.png new file mode 100644 index 0000000000000..36ac203422e76 Binary files /dev/null and b/doc/source/_static/trunc_before.png differ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index c033debbb6808..d0696a0be156d 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -23,6 +23,8 @@ users upgrade to this version. - :ref:`API Changes ` +- :ref:`Groupby API Changes ` + - :ref:`Performance Improvements ` - :ref:`Prior Deprecations ` @@ -216,6 +218,24 @@ API changes Display Changes ~~~~~~~~~~~~~~~ +- The default way of printing large DataFrames has changed. DataFrames + exceeding ``max_rows`` and/or ``max_columns`` are now displayed in a + centrally truncated view, consistent with the printing of a + :class:`pandas.Series` (:issue:`5603`). + + In previous versions, a DataFrame was truncated once the dimension + constraints were reached and an ellipsis (...) signaled that part of + the data was cut off. + + .. image:: _static/trunc_before.png + :alt: The previous look of truncation. + + In the current version, large DataFrames are centrally truncated, + showing a preview of head and tail in both dimensions. + + .. image:: _static/trunc_after.png + :alt: The new look. + - allow option ``'truncate'`` for ``display.show_dimensions`` to only show the dimensions if the frame is truncated (:issue:`6547`). 
diff --git a/pandas/core/format.py b/pandas/core/format.py index 49e98fe9911c5..0905640c85ac1 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -1,8 +1,10 @@ + #coding: utf-8 from __future__ import print_function # pylint: disable=W0141 import sys +import re from pandas.core.base import PandasObject from pandas.core.common import adjoin, isnull, notnull @@ -309,38 +311,65 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, else: self.columns = frame.columns + self._chk_truncate() + + def _chk_truncate(self): + from pandas.tools.merge import concat + + truncate_h = self.max_cols and (len(self.columns) > self.max_cols) + truncate_v = self.max_rows and (len(self.frame) > self.max_rows) + + # Cut the data to the information actually printed + max_cols = self.max_cols + max_rows = self.max_rows + frame = self.frame + if truncate_h: + if max_cols > 1: + col_num = (max_cols // 2) + frame = concat( (frame.iloc[:,:col_num],frame.iloc[:,-col_num:]),axis=1 ) + else: + col_num = max_cols + frame = frame.iloc[:,:max_cols] + self.tr_col_num = col_num + if truncate_v: + if max_rows > 1: + row_num = max_rows // 2 + frame = concat( (frame.iloc[:row_num,:],frame.iloc[-row_num:,:]) ) + else: + row_num = max_rows + frame = frame.iloc[:max_rows,:] + self.tr_row_num = row_num + + self.tr_frame = frame + self.truncate_h = truncate_h + self.truncate_v = truncate_v + self.is_truncated = self.truncate_h or self.truncate_v + def _to_str_columns(self): """ Render a DataFrame to a list of columns (as lists of strings). 
""" + _strlen = _strlen_func() + frame = self.tr_frame # may include levels names also - str_index = self._get_formatted_index() - str_columns = self._get_formatted_column_labels() - - _strlen = _strlen_func() + str_index = self._get_formatted_index(frame) - cols_to_show = self.columns[:self.max_cols] - self.truncated_h = truncate_h = self.max_cols and (len(self.columns) > self.max_cols) - self.truncated_v = truncate_v = self.max_rows and (len(self.frame) > self.max_rows) - self.is_truncated = self.truncated_h or self.truncated_v - if truncate_h: - cols_to_show = self.columns[:self.max_cols] - else: - cols_to_show = self.columns + str_columns = self._get_formatted_column_labels(frame) if self.header: stringified = [] - for i, c in enumerate(cols_to_show): - fmt_values = self._format_col(i) + col_headers = frame.columns + for i, c in enumerate(frame): cheader = str_columns[i] - max_colwidth = max(self.col_space or 0, *(_strlen(x) for x in cheader)) + fmt_values = self._format_col(i) + fmt_values = _make_fixed_width(fmt_values, self.justify, - minimum=max_colwidth, - truncated=truncate_v) + minimum=max_colwidth) + max_len = max(np.max([_strlen(x) for x in fmt_values]), max_colwidth) @@ -351,16 +380,47 @@ def _to_str_columns(self): stringified.append(cheader + fmt_values) else: - stringified = [_make_fixed_width(self._format_col(i), self.justify, - truncated=truncate_v) - for i, c in enumerate(cols_to_show)] + stringified = [] + for i, c in enumerate(frame): + formatter = self._get_formatter(i) + fmt_values = self._format_col(i) + fmt_values = _make_fixed_width(fmt_values, self.justify) + + stringified.append(fmt_values) strcols = stringified if self.index: strcols.insert(0, str_index) + + # Add ... 
to signal truncated + truncate_h = self.truncate_h + truncate_v = self.truncate_v + if truncate_h: - strcols.append(([''] * len(str_columns[-1])) - + (['...'] * min(len(self.frame), self.max_rows))) + col_num = self.tr_col_num + col_width = len(strcols[col_num][0]) # infer from column header + strcols.insert(col_num + 1, ['...'.center(col_width)] * (len(str_index))) + if truncate_v: + n_header_rows = len(str_index) - len(frame) + row_num = self.tr_row_num + for ix,col in enumerate(strcols): + cwidth = len(strcols[ix][row_num]) # infer from above row + is_dot_col = False + if truncate_h: + is_dot_col = ix == col_num + 1 + if cwidth > 3 or is_dot_col: + my_str = '...' + else: + my_str = '..' + + if ix == 0: + dot_str = my_str.ljust(cwidth) + elif is_dot_col: + dot_str = my_str.center(cwidth) + else: + dot_str = my_str.rjust(cwidth) + + strcols[ix].insert(row_num + n_header_rows, dot_str) return strcols @@ -510,9 +570,10 @@ def write(buf, frame, column_format, strcols, longtable=False): 'method') def _format_col(self, i): + frame = self.tr_frame formatter = self._get_formatter(i) return format_array( - (self.frame.iloc[:self.max_rows_displayed, i]).get_values(), + (frame.iloc[:, i]).get_values(), formatter, float_format=self.float_format, na_rep=self.na_rep, space=self.col_space ) @@ -533,16 +594,13 @@ def to_html(self, classes=None): raise TypeError('buf is not a file name and it has no write ' ' method') - def _get_formatted_column_labels(self): + def _get_formatted_column_labels(self,frame): from pandas.core.index import _sparsify def is_numeric_dtype(dtype): return issubclass(dtype.type, np.number) - if self.max_cols: - columns = self.columns[:self.max_cols] - else: - columns = self.columns + columns = frame.columns if isinstance(columns, MultiIndex): fmt_columns = columns.format(sparsify=False, adjoin=False) @@ -580,13 +638,10 @@ def has_index_names(self): def has_column_names(self): return _has_names(self.frame.columns) - def _get_formatted_index(self): + def 
_get_formatted_index(self,frame): # Note: this is only used by to_string(), not by to_html(). - if self.max_rows: - index = self.frame.index[:self.max_rows] - else: - index = self.frame.index - columns = self.frame.columns + index = frame.index + columns = frame.columns show_index_names = self.show_index_names and self.has_index_names show_col_names = (self.show_index_names and self.has_column_names) @@ -633,7 +688,7 @@ def __init__(self, formatter, classes=None, max_rows=None, max_cols=None): self.classes = classes self.frame = self.fmt.frame - self.columns = formatter.columns + self.columns = self.fmt.tr_frame.columns self.elements = [] self.bold_rows = self.fmt.kwds.get('bold_rows', False) self.escape = self.fmt.kwds.get('escape', True) @@ -724,6 +779,7 @@ def write_result(self, buf): _put_lines(buf, self.elements) def _write_header(self, indent): + truncate_h = self.fmt.truncate_h if not self.fmt.header: # write nothing return indent @@ -745,9 +801,7 @@ def _column_header(): else: if self.fmt.index: row.append(self.columns.name or '') - row.extend(self.columns[:self.max_cols]) - if len(self.columns) > self.max_cols: - row.append('') + row.extend(self.columns) return row self.write('', indent) @@ -758,16 +812,13 @@ def _column_header(): if isinstance(self.columns, MultiIndex): template = 'colspan="%d" halign="left"' - # GH3547 - sentinel = com.sentinel_factory() - levels = self.columns.format(sparsify=sentinel, adjoin=False, - names=False) - # Truncate column names - if len(levels[0]) > self.max_cols: - levels = [lev[:self.max_cols] for lev in levels] - truncated = True + if self.fmt.sparsify: + # GH3547 + sentinel = com.sentinel_factory() else: - truncated = False + sentinel = None + levels = self.columns.format(sparsify=sentinel, + adjoin=False, names=False) level_lengths = _get_level_lengths(levels, sentinel) @@ -778,7 +829,6 @@ def _column_header(): name = self.columns.names[lnum] row = [''] * (row_levels - 1) + ['' if name is None else 
com.pprint_thing(name)] - tags = {} j = len(row) for i, v in enumerate(values): @@ -789,9 +839,16 @@ def _column_header(): continue j += 1 row.append(v) - - if truncated: - row.append('') + if truncate_h: + if self.fmt.sparsify and lnum == 0: + ins_col = row_levels + self.fmt.tr_col_num - 1 + row.insert(ins_col, '...') + + for tag in list(tags.keys()): + if tag >= ins_col: + tags[tag+1] = tags.pop(tag) + else: + row.insert(row_levels + self.fmt.tr_col_num, '...') self.write_tr(row, indent, self.indent_delta, tags=tags, header=True) @@ -799,6 +856,9 @@ def _column_header(): col_row = _column_header() align = self.fmt.justify + if truncate_h: + col_row.insert(self.fmt.tr_col_num + 1, '...') + self.write_tr(col_row, indent, self.indent_delta, header=True, align=align) @@ -820,14 +880,13 @@ def _write_body(self, indent): fmt_values = {} for i in range(min(len(self.columns), self.max_cols)): fmt_values[i] = self.fmt._format_col(i) - truncated = (len(self.columns) > self.max_cols) # write values if self.fmt.index: if isinstance(self.frame.index, MultiIndex): self._write_hierarchical_rows(fmt_values, indent) else: - self._write_regular_rows(fmt_values, indent, truncated) + self._write_regular_rows(fmt_values, indent) else: for i in range(len(self.frame)): row = [fmt_values[j][i] for j in range(len(self.columns))] @@ -839,55 +898,62 @@ def _write_body(self, indent): return indent - def _write_regular_rows(self, fmt_values, indent, truncated): - ncols = min(len(self.columns), self.max_cols) - nrows = min(len(self.frame), self.max_rows) + def _write_regular_rows(self, fmt_values, indent): + truncate_h = self.fmt.truncate_h + truncate_v = self.fmt.truncate_v + + ncols = len(self.fmt.tr_frame.columns) + nrows = len(self.fmt.tr_frame) fmt = self.fmt._get_formatter('__index__') if fmt is not None: - index_values = self.frame.index[:nrows].map(fmt) + index_values = self.fmt.tr_frame.index.map(fmt) else: - index_values = self.frame.index[:nrows].format() + index_values = 
self.fmt.tr_frame.index.format() for i in range(nrows): + + if truncate_v and i == (self.fmt.tr_row_num): + str_sep_row = [ '...' for ele in row ] + self.write_tr(str_sep_row, indent, self.indent_delta, tags=None, + nindex_levels=1) + row = [] row.append(index_values[i]) row.extend(fmt_values[j][i] for j in range(ncols)) - if truncated: - row.append('...') - self.write_tr(row, indent, self.indent_delta, tags=None, - nindex_levels=1) - if len(self.frame) > self.max_rows: - row = [''] + (['...'] * ncols) + if truncate_h: + dot_col_ix = self.fmt.tr_col_num + 1 + row.insert(dot_col_ix, '...') self.write_tr(row, indent, self.indent_delta, tags=None, nindex_levels=1) def _write_hierarchical_rows(self, fmt_values, indent): template = 'rowspan="%d" valign="top"' - frame = self.frame - ncols = min(len(self.columns), self.max_cols) - nrows = min(len(self.frame), self.max_rows) - - truncate = (len(frame) > self.max_rows) + truncate_h = self.fmt.truncate_h + truncate_v = self.fmt.truncate_v + frame = self.fmt.tr_frame + ncols = len(frame.columns) + nrows = len(frame) + row_levels = self.frame.index.nlevels - idx_values = frame.index[:nrows].format(sparsify=False, adjoin=False, + idx_values = frame.index.format(sparsify=False, adjoin=False, names=False) idx_values = lzip(*idx_values) if self.fmt.sparsify: - # GH3547 sentinel = com.sentinel_factory() - levels = frame.index[:nrows].format(sparsify=sentinel, + levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False) - # Truncate row names - if truncate: - levels = [lev[:self.max_rows] for lev in levels] level_lengths = _get_level_lengths(levels, sentinel) - for i in range(min(len(frame), self.max_rows)): + for i in range(nrows): + if truncate_v and i == (self.fmt.tr_row_num): + str_sep_row = [ '...' 
] * (len(row) + sparse_offset) + self.write_tr(str_sep_row, indent, self.indent_delta, tags=None) + row = [] tags = {} @@ -905,6 +971,8 @@ def _write_hierarchical_rows(self, fmt_values, indent): row.append(v) row.extend(fmt_values[j][i] for j in range(ncols)) + if truncate_h: + row.insert(row_levels - sparse_offset + self.fmt.tr_col_num, '...') self.write_tr(row, indent, self.indent_delta, tags=tags, nindex_levels=len(levels) - sparse_offset) else: @@ -915,15 +983,11 @@ def _write_hierarchical_rows(self, fmt_values, indent): row = [] row.extend(idx_values[i]) row.extend(fmt_values[j][i] for j in range(ncols)) + if truncate_h: + row.insert(row_levels + self.fmt.tr_col_num, '...') self.write_tr(row, indent, self.indent_delta, tags=None, nindex_levels=frame.index.nlevels) - # Truncation markers (...) - if truncate: - row = ([''] * frame.index.nlevels) + (['...'] * ncols) - self.write_tr(row, indent, self.indent_delta, tags=None) - - def _get_level_lengths(levels, sentinel=''): from itertools import groupby @@ -1877,8 +1941,7 @@ def impl(x): return impl -def _make_fixed_width(strings, justify='right', minimum=None, truncated=False): - +def _make_fixed_width(strings, justify='right', minimum=None): if len(strings) == 0 or justify == 'all': return strings @@ -1909,9 +1972,6 @@ def just(x): result = [just(x) for x in strings] - if truncated: - result.append(justfunc('...'[:max_len], max_len)) - return result diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index f61bda686c88b..61d2de458fdc9 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -1,5 +1,6 @@ from __future__ import print_function # -*- coding: utf-8 -*- +import re from pandas.compat import range, zip, lrange, StringIO, PY3, lzip, u import pandas.compat as compat @@ -45,12 +46,25 @@ def has_non_verbose_info_repr(df): return has_info and nv def has_horizontally_truncated_repr(df): + try: # Check header row + fst_line = np.array(repr(df).splitlines()[0].split()) + 
cand_col = np.where(fst_line=='...')[0][0] + except: + return False + # Make sure each row has this ... in the same place r = repr(df) - return any(l.strip().endswith('...') for l in r.splitlines()) + for ix,l in enumerate(r.splitlines()): + if not r.split()[cand_col] == '...': + return False + return True def has_vertically_truncated_repr(df): r = repr(df) - return '..' in r.splitlines()[-3] + only_dot_row = False + for row in r.splitlines(): + if re.match('^[\.\ ]+$',row): + only_dot_row = True + return only_dot_row def has_truncated_repr(df): return has_horizontally_truncated_repr(df) or has_vertically_truncated_repr(df) @@ -382,6 +396,40 @@ def test_to_string_with_col_space(self): c30 = len(df.to_string(col_space=30).split("\n")[1]) self.assertTrue(c10 < c20 < c30) + def test_to_string_truncate_indices(self): + for index in [ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, + tm.makeDateIndex, tm.makePeriodIndex ]: + for column in [ tm.makeStringIndex ]: + for h in [10,20]: + for w in [10,20]: + with option_context("display.expand_frame_repr",False): + df = DataFrame(index=index(h), columns=column(w)) + with option_context("display.max_rows", 15): + if h == 20: + self.assertTrue(has_vertically_truncated_repr(df)) + else: + self.assertFalse(has_vertically_truncated_repr(df)) + with option_context("display.max_columns", 15): + if w == 20: + print(df) + print(repr(df)) + self.assertTrue(has_horizontally_truncated_repr(df)) + else: + self.assertFalse(has_horizontally_truncated_repr(df)) + with option_context("display.max_rows", 15,"display.max_columns", 15): + if h == 20 and w == 20: + self.assertTrue(has_doubly_truncated_repr(df)) + else: + self.assertFalse(has_doubly_truncated_repr(df)) + + def test_to_string_truncate_multilevel(self): + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + df = pd.DataFrame(index=arrays,columns=arrays) + with option_context("display.max_rows", 
7,"display.max_columns", 7): + self.assertTrue(has_doubly_truncated_repr(df)) + + def test_to_html_with_col_space(self): def check_with_width(df, col_space): import re @@ -735,6 +783,338 @@ def test_to_html_regression_GH6098(self): # it works df.pivot_table(index=[u('clé1')], columns=[u('clé2')])._repr_html_() + + + + + def test_to_html_truncate(self): + index = pd.DatetimeIndex(start='20010101',freq='D',periods=20) + df = pd.DataFrame(index=index,columns=range(20)) + fmt.set_option('display.max_rows',8) + fmt.set_option('display.max_columns',4) + result = df._repr_html_() + expected = '''\ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
01...1819
2001-01-01 NaN NaN... NaN NaN
2001-01-02 NaN NaN... NaN NaN
2001-01-03 NaN NaN... NaN NaN
2001-01-04 NaN NaN... NaN NaN
..................
2001-01-17 NaN NaN... NaN NaN
2001-01-18 NaN NaN... NaN NaN
2001-01-19 NaN NaN... NaN NaN
2001-01-20 NaN NaN... NaN NaN
+

20 rows × 20 columns

+
''' + if sys.version_info[0] < 3: + expected = expected.decode('utf-8') + self.assertEqual(result, expected) + + def test_to_html_truncate_multi_index(self): + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + df = pd.DataFrame(index=arrays,columns=arrays) + fmt.set_option('display.max_rows',7) + fmt.set_option('display.max_columns',7) + result = df._repr_html_() + expected = '''\ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
barbaz...fooqux
onetwoone...twoonetwo
barone NaN NaN NaN... NaN NaN NaN
two NaN NaN NaN... NaN NaN NaN
bazone NaN NaN NaN... NaN NaN NaN
...........................
footwo NaN NaN NaN... NaN NaN NaN
quxone NaN NaN NaN... NaN NaN NaN
two NaN NaN NaN... NaN NaN NaN
+

8 rows × 8 columns

+
''' + if sys.version_info[0] < 3: + expected = expected.decode('utf-8') + self.assertEqual(result, expected) + + def test_to_html_truncate_multi_index_sparse_off(self): + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + df = pd.DataFrame(index=arrays,columns=arrays) + fmt.set_option('display.max_rows',7) + fmt.set_option('display.max_columns',7) + fmt.set_option('display.multi_sparse',False) + result = df._repr_html_() + expected = '''\ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
barbarbaz...fooquxqux
onetwoone...twoonetwo
barone NaN NaN NaN... NaN NaN NaN
bartwo NaN NaN NaN... NaN NaN NaN
bazone NaN NaN NaN... NaN NaN NaN
footwo NaN NaN NaN... NaN NaN NaN
quxone NaN NaN NaN... NaN NaN NaN
quxtwo NaN NaN NaN... NaN NaN NaN
+

8 rows × 8 columns

+
''' + if sys.version_info[0] < 3: + expected = expected.decode('utf-8') + self.assertEqual(result, expected) + + + def test_nonunicode_nonascii_alignment(self): df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) rep_str = df.to_string() @@ -1505,14 +1885,14 @@ def test_repr_html_long(self): h = max_rows - 1 df = pandas.DataFrame({'A':np.arange(1,1+h), 'B':np.arange(41, 41+h)}) reg_repr = df._repr_html_() - assert '...' not in reg_repr - assert str(40 + h) in reg_repr + assert '..' not in reg_repr + assert str(41 + max_rows // 2) in reg_repr h = max_rows + 1 df = pandas.DataFrame({'A':np.arange(1,1+h), 'B':np.arange(41, 41+h)}) long_repr = df._repr_html_() - assert '...' in long_repr - assert str(40 + h) not in long_repr + assert '..' in long_repr + assert str(41 + max_rows // 2) not in long_repr assert u('%d rows ') % h in long_repr assert u('2 columns') in long_repr @@ -1521,14 +1901,14 @@ def test_repr_html_float(self): h = max_rows - 1 df = pandas.DataFrame({'idx':np.linspace(-10,10,h), 'A':np.arange(1,1+h), 'B': np.arange(41, 41+h) }).set_index('idx') reg_repr = df._repr_html_() - assert '...' not in reg_repr + assert '..' not in reg_repr assert str(40 + h) in reg_repr h = max_rows + 1 df = pandas.DataFrame({'idx':np.linspace(-10,10,h), 'A':np.arange(1,1+h), 'B': np.arange(41, 41+h) }).set_index('idx') long_repr = df._repr_html_() - assert '...' in long_repr - assert str(40 + h) not in long_repr + assert '..' in long_repr + assert '31' not in long_repr assert u('%d rows ') % h in long_repr assert u('2 columns') in long_repr @@ -1575,7 +1955,7 @@ def test_info_repr(self): # Wide h, w = max_rows-1, max_cols+1 df = pandas.DataFrame(dict((k,np.arange(1,1+h)) for k in np.arange(w))) - assert has_vertically_truncated_repr(df) + assert has_horizontally_truncated_repr(df) with option_context('display.large_repr', 'info'): assert has_info_repr(df)