diff --git a/pandas/core/frame.py b/pandas/core/frame.py index eaab17513aaf4..687705640a467 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -92,8 +92,8 @@ import pandas.core.common as com import pandas.core.nanops as nanops import pandas.core.ops as ops -import pandas.io.formats.format as fmt import pandas.io.formats.console as console +import pandas.io.formats.format as fmt from pandas.io.formats.printing import pprint_thing import pandas.plotting._core as gfx @@ -1695,18 +1695,19 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, else: tupleize_cols = False - formatter = fmt.CSVFormatter(self, path_or_buf, - line_terminator=line_terminator, sep=sep, - encoding=encoding, - compression=compression, quoting=quoting, - na_rep=na_rep, float_format=float_format, - cols=columns, header=header, index=index, - index_label=index_label, mode=mode, - chunksize=chunksize, quotechar=quotechar, - tupleize_cols=tupleize_cols, - date_format=date_format, - doublequote=doublequote, - escapechar=escapechar, decimal=decimal) + from pandas.io.formats.csvs import CSVFormatter + formatter = CSVFormatter(self, path_or_buf, + line_terminator=line_terminator, sep=sep, + encoding=encoding, + compression=compression, quoting=quoting, + na_rep=na_rep, float_format=float_format, + cols=columns, header=header, index=index, + index_label=index_label, mode=mode, + chunksize=chunksize, quotechar=quotechar, + tupleize_cols=tupleize_cols, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, decimal=decimal) formatter.save() if path_or_buf is None: @@ -1997,7 +1998,6 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, - If False, never show counts. 
""" - from pandas.io.formats.format import _put_lines if buf is None: # pragma: no cover buf = sys.stdout @@ -2009,7 +2009,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, if len(self.columns) == 0: lines.append('Empty %s' % type(self).__name__) - _put_lines(buf, lines) + fmt.buffer_put_lines(buf, lines) return cols = self.columns @@ -2096,7 +2096,7 @@ def _sizeof_fmt(num, size_qualifier): mem_usage = self.memory_usage(index=True, deep=deep).sum() lines.append("memory usage: %s\n" % _sizeof_fmt(mem_usage, size_qualifier)) - _put_lines(buf, lines) + fmt.buffer_put_lines(buf, lines) def memory_usage(self, index=True, deep=False): """Memory usage of DataFrame columns. diff --git a/pandas/io/formats/common.py b/pandas/io/formats/common.py deleted file mode 100644 index 5cfdf58403cc0..0000000000000 --- a/pandas/io/formats/common.py +++ /dev/null @@ -1,44 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Common helper methods used in different submodules of pandas.io.formats -""" - - -def get_level_lengths(levels, sentinel=''): - """For each index in each level the function returns lengths of indexes. - - Parameters - ---------- - levels : list of lists - List of values on for level. - sentinel : string, optional - Value which states that no new index starts on there. - - Returns - ---------- - Returns list of maps. For each level returns map of indexes (key is index - in row and value is length of index). 
def get_level_lengths(levels, sentinel=''):
    """Compute how many positions each label spans in every index level.

    Parameters
    ----------
    levels : list of lists
        One sequence of labels per level.
    sentinel : string, optional
        Placeholder value meaning that no new label starts at a position.

    Returns
    -------
    list of dict
        One dict per level. Each key is the position at which a label
        starts and each value is the number of positions that label spans.
    """
    if len(levels) == 0:
        return []

    # open_slots[pos] stays True only while every level handled so far held
    # the sentinel at ``pos``; once it is cleared, a sentinel found there in
    # a deeper level is treated as the start of a regular label.
    open_slots = [True] * len(levels[0])

    spans_per_level = []
    for labels in levels:
        start = 0
        spans = {}
        for pos, label in enumerate(labels):
            if open_slots[pos] and label == sentinel:
                # still inside the span opened at ``start``
                continue
            open_slots[pos] = False
            spans[start] = pos - start
            start = pos

        # close the span that runs to the end of the level
        spans[start] = len(labels) - start
        spans_per_level.append(spans)

    return spans_per_level
class CSVFormatter(object):
    """Write a DataFrame to a CSV target in row chunks.

    Parameters
    ----------
    obj : DataFrame
        Object to serialize; its ``columns``, ``index`` and internal blocks
        are read.
    path_or_buf : path, buffer or None
        ``None`` writes into a fresh in-memory ``StringIO``. An object with
        a ``write`` attribute is used as-is; anything else is opened through
        ``_get_handle`` when ``save`` runs.
    sep : str
        Field delimiter handed to the csv writer.
    na_rep, float_format, decimal, date_format
        Forwarded to ``to_native_types`` when values are stringified.
    cols : sequence or Index, optional
        Subset of columns to write. Not allowed when the columns are a
        MultiIndex that is not being tupleized.
    header : bool or list-like
        ``False`` suppresses the header row; a list-like supplies aliases
        and must have one entry per written column.
    index : bool
        Whether index values (and index labels) are written.
    index_label : None, False, scalar or list-like
        Label(s) for the index column(s). ``None`` derives them from the
        index names, ``False`` writes none.
    mode, encoding, compression
        Forwarded to ``_get_handle``. With ``encoding=None`` the writer uses
        'ascii' on Python 2 and 'utf-8' otherwise.
    nanRep : object
        Unused; kept so existing callers keep working.
    quoting : int, optional
        A ``csv.QUOTE_*`` constant, ``csv.QUOTE_MINIMAL`` when ``None``.
    line_terminator, quotechar, doublequote, escapechar
        Forwarded to the csv writer. ``quotechar`` is forced to ``None``
        when quoting is ``csv.QUOTE_NONE``.
    chunksize : int, optional
        Rows written per chunk; defaults to ``100000 // ncols`` (at least 1).
    tupleize_cols : bool
        When True, MultiIndex columns are not written level by level.
    """

    def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
                 float_format=None, cols=None, header=True, index=True,
                 index_label=None, mode='w', nanRep=None, encoding=None,
                 compression=None, quoting=None, line_terminator='\n',
                 chunksize=None, tupleize_cols=False, quotechar='"',
                 date_format=None, doublequote=True, escapechar=None,
                 decimal='.'):

        self.obj = obj

        if path_or_buf is None:
            path_or_buf = StringIO()

        self.path_or_buf = _expand_user(_stringify_path(path_or_buf))
        self.sep = sep
        self.na_rep = na_rep
        self.float_format = float_format
        self.decimal = decimal

        self.header = header
        self.index = index
        self.index_label = index_label
        self.mode = mode
        self.encoding = encoding
        self.compression = compression

        if quoting is None:
            quoting = csvlib.QUOTE_MINIMAL
        self.quoting = quoting

        if quoting == csvlib.QUOTE_NONE:
            # prevents crash in _csv
            quotechar = None
        self.quotechar = quotechar

        self.doublequote = doublequote
        self.escapechar = escapechar

        self.line_terminator = line_terminator

        self.date_format = date_format

        self.tupleize_cols = tupleize_cols
        self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
                               not self.tupleize_cols)

        # validate mi options
        if self.has_mi_columns and cols is not None:
            raise TypeError("cannot specify cols with a MultiIndex on the "
                            "columns")

        if cols is not None:
            self.obj = self.obj.loc[:, self._native_labels(cols)]

        # re-read the columns after the selection so that duplicated labels
        # are all included, and keep them as a plain sequence of labels
        self.cols = self._native_labels(self.obj.columns)

        # preallocate one slot per column; _save_chunk fills them per chunk
        self.blocks = self.obj._data.blocks
        ncols = sum(b.shape[0] for b in self.blocks)
        self.data = [None] * ncols

        if chunksize is None:
            chunksize = (100000 // (len(self.cols) or 1)) or 1
        self.chunksize = int(chunksize)

        self.data_index = obj.index
        if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex)) and
                date_format is not None):
            # pre-format date-like index values; missing ones become ''
            self.data_index = Index([x.strftime(date_format) if notna(x) else
                                     '' for x in self.data_index])

        self.nlevels = getattr(self.data_index, 'nlevels', 1)
        if not index:
            self.nlevels = 0

    def _native_labels(self, labels):
        """Return ``labels`` as native types (for an Index) or as a list."""
        if isinstance(labels, Index):
            return labels.to_native_types(na_rep=self.na_rep,
                                          float_format=self.float_format,
                                          date_format=self.date_format,
                                          quoting=self.quoting)
        return list(labels)

    def save(self):
        """Create the csv writer and write the header and all rows.

        The target is closed afterwards only when it was opened here, i.e.
        when ``path_or_buf`` has no ``write`` attribute.
        """
        if self.encoding is None:
            encoding = 'ascii' if compat.PY2 else 'utf-8'
        else:
            encoding = self.encoding

        if hasattr(self.path_or_buf, 'write'):
            f = self.path_or_buf
            close = False
        else:
            # NOTE(review): ``handles`` is never used or closed here;
            # confirm whether _get_handle expects the caller to close them.
            f, handles = _get_handle(self.path_or_buf, self.mode,
                                     encoding=encoding,
                                     compression=self.compression)
            close = True

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep, quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            if encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            if close:
                f.close()

    def _save_header(self):
        """Write the header row(s) through ``self.writer``.

        Raises
        ------
        ValueError
            If ``header`` is a list-like of aliases whose length differs
            from the number of columns being written.
        """
        writer = self.writer
        obj = self.obj
        index_label = self.index_label
        cols = self.cols
        header = self.header
        encoded_labels = []

        has_aliases = isinstance(header, (tuple, list, np.ndarray, Index))
        if not (has_aliases or self.header):
            return
        if has_aliases:
            if len(header) != len(cols):
                raise ValueError(('Writing {ncols} cols but got {nalias} '
                                  'aliases'.format(ncols=len(cols),
                                                   nalias=len(header))))
            write_cols = header
        else:
            write_cols = cols

        if self.index and index_label is not False:
            # should write something for index label
            if index_label is None:
                if isinstance(obj.index, MultiIndex):
                    index_label = ['' if name is None else name
                                   for name in obj.index.names]
                else:
                    index_label = obj.index.name
                    if index_label is None:
                        index_label = ['']
                    else:
                        index_label = [index_label]
            elif not isinstance(index_label,
                                (list, tuple, np.ndarray, Index)):
                # given a string for a DF with Index
                index_label = [index_label]

            encoded_labels = list(index_label)

        if not self.has_mi_columns or has_aliases:
            encoded_labels += list(write_cols)
            writer.writerow(encoded_labels)
            return

        # write out the mi: one row per column level holding the level name
        # followed by ALL of the values of that level
        columns = obj.columns
        for i in range(columns.nlevels):

            # we need at least 1 index column to write our col names
            col_line = []
            if self.index:

                # name is the first column
                col_line.append(columns.names[i])

                # pad the remaining index columns
                if isinstance(index_label, list) and len(index_label) > 1:
                    col_line.extend([''] * (len(index_label) - 1))

            col_line.extend(columns._get_level_values(i))

            writer.writerow(col_line)

        # Write out the index line if it's not empty.
        # Otherwise, we will print out an extraneous
        # blank line between the mi and the data rows.
        if encoded_labels and set(encoded_labels) != {''}:
            encoded_labels.extend([''] * len(columns))
            writer.writerow(encoded_labels)

    def _save(self):
        """Write the header, then the rows in slices of ``self.chunksize``."""
        self._save_header()

        nrows = len(self.data_index)

        # write in chunksize bites; integer floor division keeps the chunk
        # count exact instead of going through a float
        chunksize = self.chunksize
        chunks = nrows // chunksize + 1

        for i in range(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, nrows)
            if start_i >= end_i:
                break

            self._save_chunk(start_i, end_i)

    def _save_chunk(self, start_i, end_i):
        """Stringify rows ``start_i:end_i`` and pass them to the writer."""
        slicer = slice(start_i, end_i)

        # create the data for a chunk
        for blk in self.blocks:
            values = blk.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                         float_format=self.float_format,
                                         decimal=self.decimal,
                                         date_format=self.date_format,
                                         quoting=self.quoting)

            for col_loc, col in zip(blk.mgr_locs, values):
                # self.data is a preallocated list
                self.data[col_loc] = col

        ix = self.data_index.to_native_types(slicer=slicer,
                                             na_rep=self.na_rep,
                                             float_format=self.float_format,
                                             decimal=self.decimal,
                                             date_format=self.date_format,
                                             quoting=self.quoting)

        libwriters.write_csv_rows(self.data, ix, self.nlevels,
                                  self.cols, self.writer)
-14,7 +14,7 @@ from pandas.core.dtypes.common import is_float, is_scalar from pandas.core.dtypes import missing from pandas import Index, MultiIndex, PeriodIndex -from pandas.io.formats.common import get_level_lengths +from pandas.io.formats.format import get_level_lengths class ExcelCell(object): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 50b4f11634b78..1731dbb3ac68d 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -5,11 +5,8 @@ """ from __future__ import print_function -from distutils.version import LooseVersion # pylint: disable=W0141 -from textwrap import dedent - from pandas.core.dtypes.missing import isna, notna from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -30,15 +27,14 @@ import pandas.core.common as com from pandas.core.index import Index, MultiIndex, _ensure_index from pandas import compat -from pandas.compat import (StringIO, lzip, range, map, zip, u, - OrderedDict, unichr) +from pandas.compat import (StringIO, lzip, map, zip, u) + from pandas.io.formats.terminal import get_terminal_size from pandas.core.config import get_option, set_option -from pandas.io.common import (_get_handle, UnicodeWriter, _expand_user, - _stringify_path) +from pandas.io.common import (_expand_user, _stringify_path) from pandas.io.formats.printing import adjoin, justify, pprint_thing -from pandas.io.formats.common import get_level_lengths -from pandas._libs import lib, writers as libwriters +from pandas._libs import lib + from pandas._libs.tslib import (iNaT, Timestamp, Timedelta, format_array_from_datetime) from pandas.core.indexes.datetimes import DatetimeIndex @@ -46,7 +42,6 @@ import pandas as pd import numpy as np -import csv from functools import partial common_docstring = """ @@ -354,6 +349,7 @@ def _get_adjustment(): class TableFormatter(object): + is_truncated = False show_dimensions = None @@ -698,6 +694,7 @@ def to_latex(self, column_format=None, longtable=False, encoding=None, Render a 
class LatexFormatter(TableFormatter):
    """ Used to render a DataFrame to a LaTeX tabular/longtable environment
    output.

    Parameters
    ----------
    formatter : `DataFrameFormatter`
        Supplies the frame, the string columns (``_to_str_columns``) and the
        ``index`` / ``header`` / ``kwds`` options read during rendering.
    column_format : str, default None
        The columns format as specified in the LaTeX table format,
        e.g 'rcl' for 3 columns. When None it is derived from the column
        dtypes: 'r' for numeric columns, 'l' otherwise, with one leading
        'l' per index level when the index is written.
    longtable : boolean, default False
        Use a longtable environment instead of tabular.
    multicolumn : boolean, default False
        Merge repeated column-header cells into ``\\multicolumn`` entries.
    multicolumn_format : str, default None
        Alignment inserted into each generated ``\\multicolumn`` entry.
    multirow : boolean, default False
        Merge repeated index cells into ``\\multirow`` entries and emit
        ``\\cline`` separators after each merged group.

    See also
    --------
    HTMLFormatter
    """

    def __init__(self, formatter, column_format=None, longtable=False,
                 multicolumn=False, multicolumn_format=None, multirow=False):
        self.fmt = formatter
        self.frame = self.fmt.frame
        self.bold_rows = self.fmt.kwds.get('bold_rows', False)
        self.column_format = column_format
        self.longtable = longtable
        self.multicolumn = multicolumn
        self.multicolumn_format = multicolumn_format
        self.multirow = multirow

    def write_result(self, buf):
        """
        Render a DataFrame to a LaTeX tabular/longtable environment output.

        Parameters
        ----------
        buf : object with a ``write`` method
            Receives the generated LaTeX source.

        Raises
        ------
        AssertionError
            If ``column_format`` is given but is not a string.
        """

        # string representation of the columns
        if len(self.frame.columns) == 0 or len(self.frame.index) == 0:
            # empty frame: render a single cell describing it
            info_line = (u('Empty {name}\nColumns: {col}\nIndex: {idx}')
                         .format(name=type(self.frame).__name__,
                                 col=self.frame.columns,
                                 idx=self.frame.index))
            strcols = [[info_line]]
        else:
            strcols = self.fmt._to_str_columns()

        def get_col_type(dtype):
            # numeric columns are right-aligned, everything else left-aligned
            if issubclass(dtype.type, np.number):
                return 'r'
            else:
                return 'l'

        # reestablish the MultiIndex that has been joined by _to_str_column
        if self.fmt.index and isinstance(self.frame.index, MultiIndex):
            clevels = self.frame.columns.nlevels
            # drop the first string column and rebuild one column per level
            strcols.pop(0)
            name = any(self.frame.index.names)
            cname = any(self.frame.columns.names)
            lastcol = self.frame.index.nlevels - 1
            previous_lev3 = None
            for i, lev in enumerate(self.frame.index.levels):
                lev2 = lev.format()
                blank = ' ' * len(lev2[0])
                # display column names in last index-column
                if cname and i == lastcol:
                    lev3 = [x if x else '{}' for x in self.frame.columns.names]
                else:
                    lev3 = [blank] * clevels
                if name:
                    lev3.append(lev.name)
                current_idx_val = None
                for level_idx in self.frame.index.labels[i]:
                    if ((previous_lev3 is None or
                         previous_lev3[len(lev3)].isspace()) and
                            lev2[level_idx] == current_idx_val):
                        # same index as above row and left index was the same
                        lev3.append(blank)
                    else:
                        # different value than above or left index different
                        lev3.append(lev2[level_idx])
                        current_idx_val = lev2[level_idx]
                strcols.insert(i, lev3)
                previous_lev3 = lev3

        column_format = self.column_format
        if column_format is None:
            dtypes = self.frame.dtypes._values
            column_format = ''.join(map(get_col_type, dtypes))
            if self.fmt.index:
                index_format = 'l' * self.frame.index.nlevels
                column_format = index_format + column_format
        elif not isinstance(column_format,
                            compat.string_types):  # pragma: no cover
            raise AssertionError('column_format must be str or unicode, '
                                 'not {typ}'.format(typ=type(column_format)))

        if not self.longtable:
            buf.write('\\begin{{tabular}}{{{fmt}}}\n'
                      .format(fmt=column_format))
            buf.write('\\toprule\n')
        else:
            buf.write('\\begin{{longtable}}{{{fmt}}}\n'
                      .format(fmt=column_format))
            buf.write('\\toprule\n')

        ilevels = self.frame.index.nlevels
        clevels = self.frame.columns.nlevels
        # number of header rows: one per column level, plus one more when
        # any index level is named
        nlevels = clevels
        if any(self.frame.index.names):
            nlevels += 1
        # transpose the string columns into rows
        strrows = list(zip(*strcols))
        # pending [row number, first column] pairs for \cline separators
        self.clinebuf = []

        for i, row in enumerate(strrows):
            if i == nlevels and self.fmt.header:
                buf.write('\\midrule\n')  # End of header
                if self.longtable:
                    buf.write('\\endhead\n')
                    buf.write('\\midrule\n')
                    buf.write('\\multicolumn{{{n}}}{{r}}{{{{Continued on next '
                              'page}}}} \\\\\n'.format(n=len(row)))
                    buf.write('\\midrule\n')
                    buf.write('\\endfoot\n\n')
                    buf.write('\\bottomrule\n')
                    buf.write('\\endlastfoot\n')
            if self.fmt.kwds.get('escape', True):
                # escape backslashes first
                crow = [(x.replace('\\', '\\textbackslash').replace('_', '\\_')
                         .replace('%', '\\%').replace('$', '\\$')
                         .replace('#', '\\#').replace('{', '\\{')
                         .replace('}', '\\}').replace('~', '\\textasciitilde')
                         .replace('^', '\\textasciicircum').replace('&', '\\&')
                         if (x and x != '{}') else '{}') for x in row]
            else:
                crow = [x if x else '{}' for x in row]
            if self.bold_rows and self.fmt.index:
                # bold row labels
                crow = ['\\textbf{{{x}}}'.format(x=x)
                        if j < ilevels and x.strip() not in ['', '{}'] else x
                        for j, x in enumerate(crow)]
            if i < clevels and self.fmt.header and self.multicolumn:
                # sum up columns to multicolumns
                crow = self._format_multicolumn(crow, ilevels)
            if (i >= nlevels and self.fmt.index and self.multirow and
                    ilevels > 1):
                # sum up rows to multirows
                crow = self._format_multirow(crow, ilevels, i, strrows)
            buf.write(' & '.join(crow))
            buf.write(' \\\\\n')
            if self.multirow and i < len(strrows) - 1:
                self._print_cline(buf, i, len(strcols))

        if not self.longtable:
            buf.write('\\bottomrule\n')
            buf.write('\\end{tabular}\n')
        else:
            buf.write('\\end{longtable}\n')

    def _format_multicolumn(self, row, ilevels):
        r"""
        Combine columns belonging to a group to a single multicolumn entry
        according to self.multicolumn_format

        e.g.:
        a &  &  & b & c &
        will become
        \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c}

        Parameters
        ----------
        row : sequence of str
            Cells of one header row.
        ilevels : int
            Number of leading cells that are copied through unchanged.

        Returns
        -------
        list of str
            The row with each run of blank cells folded into the cell
            before it.
        """
        row2 = list(row[:ilevels])
        ncol = 1
        coltext = ''

        def append_col():
            # write multicolumn if needed
            if ncol > 1:
                row2.append('\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}'
                            .format(ncol=ncol, fmt=self.multicolumn_format,
                                    txt=coltext.strip()))
            # don't modify where not needed
            else:
                row2.append(coltext)
        for c in row[ilevels:]:
            # if next col has text, write the previous
            if c.strip():
                if coltext:
                    append_col()
                coltext = c
                ncol = 1
            # if not, add it to the previous multicolumn
            else:
                ncol += 1
        # write last column name
        if coltext:
            append_col()
        return row2

    def _format_multirow(self, row, ilevels, i, rows):
        r"""
        Check following rows, whether row should be a multirow

        e.g.:     becomes:
        a & 0 &   \multirow{2}{*}{a} & 0 &
          & 1 &     & 1 &
        b & 0 &   \cline{1-2}
                  b & 0 &

        Parameters
        ----------
        row : list of str
            Cells of the current row; modified in place and returned.
        ilevels : int
            Number of leading cells that are inspected.
        i : int
            Position of ``row`` within ``rows``.
        rows : sequence
            All rows of the table.
        """
        for j in range(ilevels):
            if row[j].strip():
                # count the blank cells directly below this one
                nrow = 1
                for r in rows[i + 1:]:
                    if not r[j].strip():
                        nrow += 1
                    else:
                        break
                if nrow > 1:
                    # overwrite non-multirow entry
                    row[j] = '\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}'.format(
                        nrow=nrow, row=row[j].strip())
                    # save when to end the current block with \cline
                    self.clinebuf.append([i + nrow - 1, j + 1])
        return row

    def _print_cline(self, buf, i, icol):
        """
        Print clines after multirow-blocks are finished

        Writes one ``\\cline`` per entry of ``self.clinebuf`` recorded for
        row ``i``, spanning from the recorded column to ``icol``.
        """
        for cl in self.clinebuf:
            if cl[0] == i:
                buf.write('\\cline{{{cl:d}-{icol:d}}}\n'
                          .format(cl=cl[1], icol=icol))
                # remove entries that have been written to buffer
                self.clinebuf = [x for x in self.clinebuf if x[0] != i]
max_rows=None, max_cols=None, - notebook=False, border=None, table_id=None): - self.fmt = formatter - self.classes = classes - - self.frame = self.fmt.frame - self.columns = self.fmt.tr_frame.columns - self.elements = [] - self.bold_rows = self.fmt.kwds.get('bold_rows', False) - self.escape = self.fmt.kwds.get('escape', True) - - self.max_rows = max_rows or len(self.fmt.frame) - self.max_cols = max_cols or len(self.fmt.columns) - self.show_dimensions = self.fmt.show_dimensions - self.is_truncated = (self.max_rows < len(self.fmt.frame) or - self.max_cols < len(self.fmt.columns)) - self.notebook = notebook - if border is None: - border = get_option('display.html.border') - self.border = border - self.table_id = table_id - - def write(self, s, indent=0): - rs = pprint_thing(s) - self.elements.append(' ' * indent + rs) - - def write_th(self, s, indent=0, tags=None): - if self.fmt.col_space is not None and self.fmt.col_space > 0: - tags = (tags or "") - tags += ('style="min-width: {colspace};"' - .format(colspace=self.fmt.col_space)) - - return self._write_cell(s, kind='th', indent=indent, tags=tags) - - def write_td(self, s, indent=0, tags=None): - return self._write_cell(s, kind='td', indent=indent, tags=tags) - - def _write_cell(self, s, kind='td', indent=0, tags=None): - if tags is not None: - start_tag = '<{kind} {tags}>'.format(kind=kind, tags=tags) - else: - start_tag = '<{kind}>'.format(kind=kind) - - if self.escape: - # escape & first to prevent double escaping of & - esc = OrderedDict([('&', r'&'), ('<', r'<'), - ('>', r'>')]) - else: - esc = {} - rs = pprint_thing(s, escape_chars=esc).strip() - self.write(u'{start}{rs}' - .format(start=start_tag, rs=rs, kind=kind), indent) - - def write_tr(self, line, indent=0, indent_delta=4, header=False, - align=None, tags=None, nindex_levels=0): - if tags is None: - tags = {} - - if align is None: - self.write('', indent) - else: - self.write('' - .format(align=align), indent) - indent += indent_delta - - for i, s in 
enumerate(line): - val_tag = tags.get(i, None) - if header or (self.bold_rows and i < nindex_levels): - self.write_th(s, indent, tags=val_tag) - else: - self.write_td(s, indent, tags=val_tag) - - indent -= indent_delta - self.write('', indent) - - def write_style(self): - # We use the "scoped" attribute here so that the desired - # style properties for the data frame are not then applied - # throughout the entire notebook. - template_first = """\ - """ - template_select = """\ - .dataframe %s { - %s: %s; - }""" - element_props = [('tbody tr th:only-of-type', - 'vertical-align', - 'middle'), - ('tbody tr th', - 'vertical-align', - 'top')] - if isinstance(self.columns, MultiIndex): - element_props.append(('thead tr th', - 'text-align', - 'left')) - if all((self.fmt.has_index_names, - self.fmt.index, - self.fmt.show_index_names)): - element_props.append(('thead tr:last-of-type th', - 'text-align', - 'right')) - else: - element_props.append(('thead th', - 'text-align', - 'right')) - template_mid = '\n\n'.join(map(lambda t: template_select % t, - element_props)) - template = dedent('\n'.join((template_first, - template_mid, - template_last))) - if self.notebook: - self.write(template) - - def write_result(self, buf): - indent = 0 - id_section = "" - frame = self.frame - - _classes = ['dataframe'] # Default class. 
- use_mathjax = get_option("display.html.use_mathjax") - if not use_mathjax: - _classes.append('tex2jax_ignore') - if self.classes is not None: - if isinstance(self.classes, str): - self.classes = self.classes.split() - if not isinstance(self.classes, (list, tuple)): - raise AssertionError('classes must be list or tuple, not {typ}' - .format(typ=type(self.classes))) - _classes.extend(self.classes) - - if self.notebook: - div_style = '' - try: - import IPython - if IPython.__version__ < LooseVersion('3.0.0'): - div_style = ' style="max-width:1500px;overflow:auto;"' - except (ImportError, AttributeError): - pass - - self.write(''.format(style=div_style)) - - self.write_style() - - if self.table_id is not None: - id_section = ' id="{table_id}"'.format(table_id=self.table_id) - self.write('' - .format(border=self.border, cls=' '.join(_classes), - id_section=id_section), indent) - - indent += self.indent_delta - indent = self._write_header(indent) - indent = self._write_body(indent) - - self.write('
', indent) - if self.should_show_dimensions: - by = chr(215) if compat.PY3 else unichr(215) # × - self.write(u('

{rows} rows {by} {cols} columns

') - .format(rows=len(frame), - by=by, - cols=len(frame.columns))) - - if self.notebook: - self.write('') - - _put_lines(buf, self.elements) - - def _write_header(self, indent): - truncate_h = self.fmt.truncate_h - row_levels = self.frame.index.nlevels - if not self.fmt.header: - # write nothing - return indent - - def _column_header(): - if self.fmt.index: - row = [''] * (self.frame.index.nlevels - 1) - else: - row = [] - - if isinstance(self.columns, MultiIndex): - if self.fmt.has_column_names and self.fmt.index: - row.append(single_column_table(self.columns.names)) - else: - row.append('') - style = "text-align: {just};".format(just=self.fmt.justify) - row.extend([single_column_table(c, self.fmt.justify, style) - for c in self.columns]) - else: - if self.fmt.index: - row.append(self.columns.name or '') - row.extend(self.columns) - return row - - self.write('', indent) - row = [] - - indent += self.indent_delta - - if isinstance(self.columns, MultiIndex): - template = 'colspan="{span:d}" halign="left"' - - if self.fmt.sparsify: - # GH3547 - sentinel = com.sentinel_factory() - else: - sentinel = None - levels = self.columns.format(sparsify=sentinel, adjoin=False, - names=False) - level_lengths = get_level_lengths(levels, sentinel) - inner_lvl = len(level_lengths) - 1 - for lnum, (records, values) in enumerate(zip(level_lengths, - levels)): - if truncate_h: - # modify the header lines - ins_col = self.fmt.tr_col_num - if self.fmt.sparsify: - recs_new = {} - # Increment tags after ... col. - for tag, span in list(records.items()): - if tag >= ins_col: - recs_new[tag + 1] = span - elif tag + span > ins_col: - recs_new[tag] = span + 1 - if lnum == inner_lvl: - values = (values[:ins_col] + (u('...'),) + - values[ins_col:]) - else: - # sparse col headers do not receive a ... - values = (values[:ins_col] + - (values[ins_col - 1], ) + - values[ins_col:]) - else: - recs_new[tag] = span - # if ins_col lies between tags, all col headers - # get ... 
- if tag + span == ins_col: - recs_new[ins_col] = 1 - values = (values[:ins_col] + (u('...'),) + - values[ins_col:]) - records = recs_new - inner_lvl = len(level_lengths) - 1 - if lnum == inner_lvl: - records[ins_col] = 1 - else: - recs_new = {} - for tag, span in list(records.items()): - if tag >= ins_col: - recs_new[tag + 1] = span - else: - recs_new[tag] = span - recs_new[ins_col] = 1 - records = recs_new - values = (values[:ins_col] + [u('...')] + - values[ins_col:]) - - name = self.columns.names[lnum] - row = [''] * (row_levels - 1) + ['' if name is None else - pprint_thing(name)] - - if row == [""] and self.fmt.index is False: - row = [] - - tags = {} - j = len(row) - for i, v in enumerate(values): - if i in records: - if records[i] > 1: - tags[j] = template.format(span=records[i]) - else: - continue - j += 1 - row.append(v) - self.write_tr(row, indent, self.indent_delta, tags=tags, - header=True) - else: - col_row = _column_header() - align = self.fmt.justify - - if truncate_h: - ins_col = row_levels + self.fmt.tr_col_num - col_row.insert(ins_col, '...') - - self.write_tr(col_row, indent, self.indent_delta, header=True, - align=align) - - if all((self.fmt.has_index_names, - self.fmt.index, - self.fmt.show_index_names)): - row = ([x if x is not None else '' - for x in self.frame.index.names] + - [''] * min(len(self.columns), self.max_cols)) - if truncate_h: - ins_col = row_levels + self.fmt.tr_col_num - row.insert(ins_col, '') - self.write_tr(row, indent, self.indent_delta, header=True) - - indent -= self.indent_delta - self.write('', indent) - - return indent - - def _write_body(self, indent): - self.write('', indent) - indent += self.indent_delta - - fmt_values = {} - for i in range(min(len(self.columns), self.max_cols)): - fmt_values[i] = self.fmt._format_col(i) - - # write values - if self.fmt.index: - if isinstance(self.frame.index, MultiIndex): - self._write_hierarchical_rows(fmt_values, indent) - else: - self._write_regular_rows(fmt_values, indent) - 
else: - for i in range(min(len(self.frame), self.max_rows)): - row = [fmt_values[j][i] for j in range(len(self.columns))] - self.write_tr(row, indent, self.indent_delta, tags=None) - - indent -= self.indent_delta - self.write('', indent) - indent -= self.indent_delta - - return indent - - def _write_regular_rows(self, fmt_values, indent): - truncate_h = self.fmt.truncate_h - truncate_v = self.fmt.truncate_v - - ncols = len(self.fmt.tr_frame.columns) - nrows = len(self.fmt.tr_frame) - fmt = self.fmt._get_formatter('__index__') - if fmt is not None: - index_values = self.fmt.tr_frame.index.map(fmt) - else: - index_values = self.fmt.tr_frame.index.format() - - row = [] - for i in range(nrows): - - if truncate_v and i == (self.fmt.tr_row_num): - str_sep_row = ['...' for ele in row] - self.write_tr(str_sep_row, indent, self.indent_delta, - tags=None, nindex_levels=1) - - row = [] - row.append(index_values[i]) - row.extend(fmt_values[j][i] for j in range(ncols)) - - if truncate_h: - dot_col_ix = self.fmt.tr_col_num + 1 - row.insert(dot_col_ix, '...') - self.write_tr(row, indent, self.indent_delta, tags=None, - nindex_levels=1) - - def _write_hierarchical_rows(self, fmt_values, indent): - template = 'rowspan="{span}" valign="top"' - - truncate_h = self.fmt.truncate_h - truncate_v = self.fmt.truncate_v - frame = self.fmt.tr_frame - ncols = len(frame.columns) - nrows = len(frame) - row_levels = self.frame.index.nlevels - - idx_values = frame.index.format(sparsify=False, adjoin=False, - names=False) - idx_values = lzip(*idx_values) - - if self.fmt.sparsify: - # GH3547 - sentinel = com.sentinel_factory() - levels = frame.index.format(sparsify=sentinel, adjoin=False, - names=False) - - level_lengths = get_level_lengths(levels, sentinel) - inner_lvl = len(level_lengths) - 1 - if truncate_v: - # Insert ... row and adjust idx_values and - # level_lengths to take this into account. 
- ins_row = self.fmt.tr_row_num - inserted = False - for lnum, records in enumerate(level_lengths): - rec_new = {} - for tag, span in list(records.items()): - if tag >= ins_row: - rec_new[tag + 1] = span - elif tag + span > ins_row: - rec_new[tag] = span + 1 - - # GH 14882 - Make sure insertion done once - if not inserted: - dot_row = list(idx_values[ins_row - 1]) - dot_row[-1] = u('...') - idx_values.insert(ins_row, tuple(dot_row)) - inserted = True - else: - dot_row = list(idx_values[ins_row]) - dot_row[inner_lvl - lnum] = u('...') - idx_values[ins_row] = tuple(dot_row) - else: - rec_new[tag] = span - # If ins_row lies between tags, all cols idx cols - # receive ... - if tag + span == ins_row: - rec_new[ins_row] = 1 - if lnum == 0: - idx_values.insert(ins_row, tuple( - [u('...')] * len(level_lengths))) - - # GH 14882 - Place ... in correct level - elif inserted: - dot_row = list(idx_values[ins_row]) - dot_row[inner_lvl - lnum] = u('...') - idx_values[ins_row] = tuple(dot_row) - level_lengths[lnum] = rec_new - - level_lengths[inner_lvl][ins_row] = 1 - for ix_col in range(len(fmt_values)): - fmt_values[ix_col].insert(ins_row, '...') - nrows += 1 - - for i in range(nrows): - row = [] - tags = {} - - sparse_offset = 0 - j = 0 - for records, v in zip(level_lengths, idx_values[i]): - if i in records: - if records[i] > 1: - tags[j] = template.format(span=records[i]) - else: - sparse_offset += 1 - continue - - j += 1 - row.append(v) - - row.extend(fmt_values[j][i] for j in range(ncols)) - if truncate_h: - row.insert(row_levels - sparse_offset + - self.fmt.tr_col_num, '...') - self.write_tr(row, indent, self.indent_delta, tags=tags, - nindex_levels=len(levels) - sparse_offset) - else: - for i in range(len(frame)): - idx_values = list(zip(*frame.index.format( - sparsify=False, adjoin=False, names=False))) - row = [] - row.extend(idx_values[i]) - row.extend(fmt_values[j][i] for j in range(ncols)) - if truncate_h: - row.insert(row_levels + self.fmt.tr_col_num, '...') - 
self.write_tr(row, indent, self.indent_delta, tags=None, - nindex_levels=frame.index.nlevels) - - -class CSVFormatter(object): - - def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', - float_format=None, cols=None, header=True, index=True, - index_label=None, mode='w', nanRep=None, encoding=None, - compression=None, quoting=None, line_terminator='\n', - chunksize=None, tupleize_cols=False, quotechar='"', - date_format=None, doublequote=True, escapechar=None, - decimal='.'): - - self.obj = obj - - if path_or_buf is None: - path_or_buf = StringIO() - - self.path_or_buf = _expand_user(_stringify_path(path_or_buf)) - self.sep = sep - self.na_rep = na_rep - self.float_format = float_format - self.decimal = decimal - - self.header = header - self.index = index - self.index_label = index_label - self.mode = mode - self.encoding = encoding - self.compression = compression - - if quoting is None: - quoting = csv.QUOTE_MINIMAL - self.quoting = quoting - - if quoting == csv.QUOTE_NONE: - # prevents crash in _csv - quotechar = None - self.quotechar = quotechar - - self.doublequote = doublequote - self.escapechar = escapechar - - self.line_terminator = line_terminator - - self.date_format = date_format - - self.tupleize_cols = tupleize_cols - self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and - not self.tupleize_cols) - - # validate mi options - if self.has_mi_columns: - if cols is not None: - raise TypeError("cannot specify cols with a MultiIndex on the " - "columns") - - if cols is not None: - if isinstance(cols, Index): - cols = cols.to_native_types(na_rep=na_rep, - float_format=float_format, - date_format=date_format, - quoting=self.quoting) - else: - cols = list(cols) - self.obj = self.obj.loc[:, cols] - - # update columns to include possible multiplicity of dupes - # and make sure sure cols is just a list of labels - cols = self.obj.columns - if isinstance(cols, Index): - cols = cols.to_native_types(na_rep=na_rep, - float_format=float_format, - 
date_format=date_format, - quoting=self.quoting) - else: - cols = list(cols) - - # save it - self.cols = cols - - # preallocate data 2d list - self.blocks = self.obj._data.blocks - ncols = sum(b.shape[0] for b in self.blocks) - self.data = [None] * ncols - - if chunksize is None: - chunksize = (100000 // (len(self.cols) or 1)) or 1 - self.chunksize = int(chunksize) - - self.data_index = obj.index - if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex)) and - date_format is not None): - self.data_index = Index([x.strftime(date_format) if notna(x) else - '' for x in self.data_index]) - - self.nlevels = getattr(self.data_index, 'nlevels', 1) - if not index: - self.nlevels = 0 - - def save(self): - # create the writer & save - if self.encoding is None: - if compat.PY2: - encoding = 'ascii' - else: - encoding = 'utf-8' - else: - encoding = self.encoding - - if hasattr(self.path_or_buf, 'write'): - f = self.path_or_buf - close = False - else: - f, handles = _get_handle(self.path_or_buf, self.mode, - encoding=encoding, - compression=self.compression) - close = True - - try: - writer_kwargs = dict(lineterminator=self.line_terminator, - delimiter=self.sep, quoting=self.quoting, - doublequote=self.doublequote, - escapechar=self.escapechar, - quotechar=self.quotechar) - if encoding == 'ascii': - self.writer = csv.writer(f, **writer_kwargs) - else: - writer_kwargs['encoding'] = encoding - self.writer = UnicodeWriter(f, **writer_kwargs) - - self._save() - - finally: - if close: - f.close() - - def _save_header(self): - - writer = self.writer - obj = self.obj - index_label = self.index_label - cols = self.cols - has_mi_columns = self.has_mi_columns - header = self.header - encoded_labels = [] - - has_aliases = isinstance(header, (tuple, list, np.ndarray, Index)) - if not (has_aliases or self.header): - return - if has_aliases: - if len(header) != len(cols): - raise ValueError(('Writing {ncols} cols but got {nalias} ' - 'aliases'.format(ncols=len(cols), - 
nalias=len(header)))) - else: - write_cols = header - else: - write_cols = cols - - if self.index: - # should write something for index label - if index_label is not False: - if index_label is None: - if isinstance(obj.index, MultiIndex): - index_label = [] - for i, name in enumerate(obj.index.names): - if name is None: - name = '' - index_label.append(name) - else: - index_label = obj.index.name - if index_label is None: - index_label = [''] - else: - index_label = [index_label] - elif not isinstance(index_label, - (list, tuple, np.ndarray, Index)): - # given a string for a DF with Index - index_label = [index_label] - - encoded_labels = list(index_label) - else: - encoded_labels = [] - - if not has_mi_columns or has_aliases: - encoded_labels += list(write_cols) - writer.writerow(encoded_labels) - else: - # write out the mi - columns = obj.columns - - # write out the names for each level, then ALL of the values for - # each level - for i in range(columns.nlevels): - - # we need at least 1 index column to write our col names - col_line = [] - if self.index: - - # name is the first column - col_line.append(columns.names[i]) - - if isinstance(index_label, list) and len(index_label) > 1: - col_line.extend([''] * (len(index_label) - 1)) - - col_line.extend(columns._get_level_values(i)) - - writer.writerow(col_line) - - # Write out the index line if it's not empty. - # Otherwise, we will print out an extraneous - # blank line between the mi and the data rows. 
- if encoded_labels and set(encoded_labels) != set(['']): - encoded_labels.extend([''] * len(columns)) - writer.writerow(encoded_labels) - - def _save(self): - - self._save_header() - - nrows = len(self.data_index) - - # write in chunksize bites - chunksize = self.chunksize - chunks = int(nrows / chunksize) + 1 - - for i in range(chunks): - start_i = i * chunksize - end_i = min((i + 1) * chunksize, nrows) - if start_i >= end_i: - break - - self._save_chunk(start_i, end_i) - - def _save_chunk(self, start_i, end_i): - - data_index = self.data_index - - # create the data for a chunk - slicer = slice(start_i, end_i) - for i in range(len(self.blocks)): - b = self.blocks[i] - d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, - float_format=self.float_format, - decimal=self.decimal, - date_format=self.date_format, - quoting=self.quoting) - - for col_loc, col in zip(b.mgr_locs, d): - # self.data is a preallocated list - self.data[col_loc] = col - - ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, - float_format=self.float_format, - decimal=self.decimal, - date_format=self.date_format, - quoting=self.quoting) - - libwriters.write_csv_rows(self.data, ix, self.nlevels, - self.cols, self.writer) - - # ---------------------------------------------------------------------- # Array formatters @@ -2366,27 +1406,6 @@ def _cond(values): return [x + "0" if x.endswith('.') and x != na_rep else x for x in trimmed] -def single_column_table(column, align=None, style=None): - table = '{i!s}'.format(i=i)) - table += '' - return table - - -def single_row_table(row): # pragma: no cover - table = '' - for i in row: - table += (''.format(i=i)) - table += '
{i!s}
' - return table - - def _has_names(index): if isinstance(index, MultiIndex): return com._any_not_none(*index.names) @@ -2506,12 +1525,6 @@ def set_eng_float_format(accuracy=3, use_eng_prefix=False): set_option("display.column_space", max(12, accuracy + 9)) -def _put_lines(buf, lines): - if any(isinstance(x, compat.text_type) for x in lines): - lines = [compat.text_type(x) for x in lines] - buf.write('\n'.join(lines)) - - def _binify(cols, line_width): adjoin_width = 1 bins = [] @@ -2530,3 +1543,59 @@ def _binify(cols, line_width): bins.append(len(cols)) return bins + + +def get_level_lengths(levels, sentinel=''): + """For each index in each level the function returns lengths of indexes. + + Parameters + ---------- + levels : list of lists + List of values on for level. + sentinel : string, optional + Value which states that no new index starts on there. + + Returns + ---------- + Returns list of maps. For each level returns map of indexes (key is index + in row and value is length of index). + """ + if len(levels) == 0: + return [] + + control = [True for x in levels[0]] + + result = [] + for level in levels: + last_index = 0 + + lengths = {} + for i, key in enumerate(level): + if control[i] and key == sentinel: + pass + else: + control[i] = False + lengths[last_index] = i - last_index + last_index = i + + lengths[last_index] = len(level) - last_index + + result.append(lengths) + + return result + + +def buffer_put_lines(buf, lines): + """ + Appends lines to a buffer. + + Parameters + ---------- + buf + The buffer to write to + lines + The lines to append. + """ + if any(isinstance(x, compat.text_type) for x in lines): + lines = [compat.text_type(x) for x in lines] + buf.write('\n'.join(lines)) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py new file mode 100644 index 0000000000000..a43c55a220292 --- /dev/null +++ b/pandas/io/formats/html.py @@ -0,0 +1,506 @@ +# -*- coding: utf-8 -*- +""" +Module for formatting output data in HTML. 
+""" + +from __future__ import print_function +from distutils.version import LooseVersion + +from textwrap import dedent + +import pandas.core.common as com +from pandas.core.index import MultiIndex +from pandas import compat +from pandas.compat import (lzip, range, map, zip, u, + OrderedDict, unichr) +from pandas.core.config import get_option +from pandas.io.formats.printing import pprint_thing +from pandas.io.formats.format import (get_level_lengths, + buffer_put_lines) +from pandas.io.formats.format import TableFormatter + + +class HTMLFormatter(TableFormatter): + + indent_delta = 2 + + def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, + notebook=False, border=None, table_id=None): + self.fmt = formatter + self.classes = classes + + self.frame = self.fmt.frame + self.columns = self.fmt.tr_frame.columns + self.elements = [] + self.bold_rows = self.fmt.kwds.get('bold_rows', False) + self.escape = self.fmt.kwds.get('escape', True) + + self.max_rows = max_rows or len(self.fmt.frame) + self.max_cols = max_cols or len(self.fmt.columns) + self.show_dimensions = self.fmt.show_dimensions + self.is_truncated = (self.max_rows < len(self.fmt.frame) or + self.max_cols < len(self.fmt.columns)) + self.notebook = notebook + if border is None: + border = get_option('display.html.border') + self.border = border + self.table_id = table_id + + def write(self, s, indent=0): + rs = pprint_thing(s) + self.elements.append(' ' * indent + rs) + + def write_th(self, s, indent=0, tags=None): + if self.fmt.col_space is not None and self.fmt.col_space > 0: + tags = (tags or "") + tags += ('style="min-width: {colspace};"' + .format(colspace=self.fmt.col_space)) + + return self._write_cell(s, kind='th', indent=indent, tags=tags) + + def write_td(self, s, indent=0, tags=None): + return self._write_cell(s, kind='td', indent=indent, tags=tags) + + def _write_cell(self, s, kind='td', indent=0, tags=None): + if tags is not None: + start_tag = '<{kind} 
{tags}>'.format(kind=kind, tags=tags) + else: + start_tag = '<{kind}>'.format(kind=kind) + + if self.escape: + # escape & first to prevent double escaping of & + esc = OrderedDict([('&', r'&'), ('<', r'<'), + ('>', r'>')]) + else: + esc = {} + rs = pprint_thing(s, escape_chars=esc).strip() + self.write(u'{start}{rs}' + .format(start=start_tag, rs=rs, kind=kind), indent) + + def write_tr(self, line, indent=0, indent_delta=4, header=False, + align=None, tags=None, nindex_levels=0): + if tags is None: + tags = {} + + if align is None: + self.write('', indent) + else: + self.write('' + .format(align=align), indent) + indent += indent_delta + + for i, s in enumerate(line): + val_tag = tags.get(i, None) + if header or (self.bold_rows and i < nindex_levels): + self.write_th(s, indent, tags=val_tag) + else: + self.write_td(s, indent, tags=val_tag) + + indent -= indent_delta + self.write('', indent) + + def write_style(self): + # We use the "scoped" attribute here so that the desired + # style properties for the data frame are not then applied + # throughout the entire notebook. + template_first = """\ + """ + template_select = """\ + .dataframe %s { + %s: %s; + }""" + element_props = [('tbody tr th:only-of-type', + 'vertical-align', + 'middle'), + ('tbody tr th', + 'vertical-align', + 'top')] + if isinstance(self.columns, MultiIndex): + element_props.append(('thead tr th', + 'text-align', + 'left')) + if all((self.fmt.has_index_names, + self.fmt.index, + self.fmt.show_index_names)): + element_props.append(('thead tr:last-of-type th', + 'text-align', + 'right')) + else: + element_props.append(('thead th', + 'text-align', + 'right')) + template_mid = '\n\n'.join(map(lambda t: template_select % t, + element_props)) + template = dedent('\n'.join((template_first, + template_mid, + template_last))) + if self.notebook: + self.write(template) + + def write_result(self, buf): + indent = 0 + id_section = "" + frame = self.frame + + _classes = ['dataframe'] # Default class. 
+ use_mathjax = get_option("display.html.use_mathjax") + if not use_mathjax: + _classes.append('tex2jax_ignore') + if self.classes is not None: + if isinstance(self.classes, str): + self.classes = self.classes.split() + if not isinstance(self.classes, (list, tuple)): + raise AssertionError('classes must be list or tuple, not {typ}' + .format(typ=type(self.classes))) + _classes.extend(self.classes) + + if self.notebook: + div_style = '' + try: + import IPython + if IPython.__version__ < LooseVersion('3.0.0'): + div_style = ' style="max-width:1500px;overflow:auto;"' + except (ImportError, AttributeError): + pass + + self.write(''.format(style=div_style)) + + self.write_style() + + if self.table_id is not None: + id_section = ' id="{table_id}"'.format(table_id=self.table_id) + self.write('' + .format(border=self.border, cls=' '.join(_classes), + id_section=id_section), indent) + + indent += self.indent_delta + indent = self._write_header(indent) + indent = self._write_body(indent) + + self.write('
', indent) + if self.should_show_dimensions: + by = chr(215) if compat.PY3 else unichr(215) # × + self.write(u('

{rows} rows {by} {cols} columns

') + .format(rows=len(frame), + by=by, + cols=len(frame.columns))) + + if self.notebook: + self.write('') + + buffer_put_lines(buf, self.elements) + + def _write_header(self, indent): + truncate_h = self.fmt.truncate_h + row_levels = self.frame.index.nlevels + if not self.fmt.header: + # write nothing + return indent + + def _column_header(): + if self.fmt.index: + row = [''] * (self.frame.index.nlevels - 1) + else: + row = [] + + if isinstance(self.columns, MultiIndex): + if self.fmt.has_column_names and self.fmt.index: + row.append(single_column_table(self.columns.names)) + else: + row.append('') + style = "text-align: {just};".format(just=self.fmt.justify) + row.extend([single_column_table(c, self.fmt.justify, style) + for c in self.columns]) + else: + if self.fmt.index: + row.append(self.columns.name or '') + row.extend(self.columns) + return row + + self.write('', indent) + row = [] + + indent += self.indent_delta + + if isinstance(self.columns, MultiIndex): + template = 'colspan="{span:d}" halign="left"' + + if self.fmt.sparsify: + # GH3547 + sentinel = com.sentinel_factory() + else: + sentinel = None + levels = self.columns.format(sparsify=sentinel, adjoin=False, + names=False) + level_lengths = get_level_lengths(levels, sentinel) + inner_lvl = len(level_lengths) - 1 + for lnum, (records, values) in enumerate(zip(level_lengths, + levels)): + if truncate_h: + # modify the header lines + ins_col = self.fmt.tr_col_num + if self.fmt.sparsify: + recs_new = {} + # Increment tags after ... col. + for tag, span in list(records.items()): + if tag >= ins_col: + recs_new[tag + 1] = span + elif tag + span > ins_col: + recs_new[tag] = span + 1 + if lnum == inner_lvl: + values = (values[:ins_col] + (u('...'),) + + values[ins_col:]) + else: + # sparse col headers do not receive a ... + values = (values[:ins_col] + + (values[ins_col - 1], ) + + values[ins_col:]) + else: + recs_new[tag] = span + # if ins_col lies between tags, all col headers + # get ... 
+ if tag + span == ins_col: + recs_new[ins_col] = 1 + values = (values[:ins_col] + (u('...'),) + + values[ins_col:]) + records = recs_new + inner_lvl = len(level_lengths) - 1 + if lnum == inner_lvl: + records[ins_col] = 1 + else: + recs_new = {} + for tag, span in list(records.items()): + if tag >= ins_col: + recs_new[tag + 1] = span + else: + recs_new[tag] = span + recs_new[ins_col] = 1 + records = recs_new + values = (values[:ins_col] + [u('...')] + + values[ins_col:]) + + name = self.columns.names[lnum] + row = [''] * (row_levels - 1) + ['' if name is None else + pprint_thing(name)] + + if row == [""] and self.fmt.index is False: + row = [] + + tags = {} + j = len(row) + for i, v in enumerate(values): + if i in records: + if records[i] > 1: + tags[j] = template.format(span=records[i]) + else: + continue + j += 1 + row.append(v) + self.write_tr(row, indent, self.indent_delta, tags=tags, + header=True) + else: + col_row = _column_header() + align = self.fmt.justify + + if truncate_h: + ins_col = row_levels + self.fmt.tr_col_num + col_row.insert(ins_col, '...') + + self.write_tr(col_row, indent, self.indent_delta, header=True, + align=align) + + if all((self.fmt.has_index_names, + self.fmt.index, + self.fmt.show_index_names)): + row = ([x if x is not None else '' + for x in self.frame.index.names] + + [''] * min(len(self.columns), self.max_cols)) + if truncate_h: + ins_col = row_levels + self.fmt.tr_col_num + row.insert(ins_col, '') + self.write_tr(row, indent, self.indent_delta, header=True) + + indent -= self.indent_delta + self.write('', indent) + + return indent + + def _write_body(self, indent): + self.write('', indent) + indent += self.indent_delta + + fmt_values = {} + for i in range(min(len(self.columns), self.max_cols)): + fmt_values[i] = self.fmt._format_col(i) + + # write values + if self.fmt.index: + if isinstance(self.frame.index, MultiIndex): + self._write_hierarchical_rows(fmt_values, indent) + else: + self._write_regular_rows(fmt_values, indent) + 
else: + for i in range(min(len(self.frame), self.max_rows)): + row = [fmt_values[j][i] for j in range(len(self.columns))] + self.write_tr(row, indent, self.indent_delta, tags=None) + + indent -= self.indent_delta + self.write('', indent) + indent -= self.indent_delta + + return indent + + def _write_regular_rows(self, fmt_values, indent): + truncate_h = self.fmt.truncate_h + truncate_v = self.fmt.truncate_v + + ncols = len(self.fmt.tr_frame.columns) + nrows = len(self.fmt.tr_frame) + fmt = self.fmt._get_formatter('__index__') + if fmt is not None: + index_values = self.fmt.tr_frame.index.map(fmt) + else: + index_values = self.fmt.tr_frame.index.format() + + row = [] + for i in range(nrows): + + if truncate_v and i == (self.fmt.tr_row_num): + str_sep_row = ['...' for ele in row] + self.write_tr(str_sep_row, indent, self.indent_delta, + tags=None, nindex_levels=1) + + row = [] + row.append(index_values[i]) + row.extend(fmt_values[j][i] for j in range(ncols)) + + if truncate_h: + dot_col_ix = self.fmt.tr_col_num + 1 + row.insert(dot_col_ix, '...') + self.write_tr(row, indent, self.indent_delta, tags=None, + nindex_levels=1) + + def _write_hierarchical_rows(self, fmt_values, indent): + template = 'rowspan="{span}" valign="top"' + + truncate_h = self.fmt.truncate_h + truncate_v = self.fmt.truncate_v + frame = self.fmt.tr_frame + ncols = len(frame.columns) + nrows = len(frame) + row_levels = self.frame.index.nlevels + + idx_values = frame.index.format(sparsify=False, adjoin=False, + names=False) + idx_values = lzip(*idx_values) + + if self.fmt.sparsify: + # GH3547 + sentinel = com.sentinel_factory() + levels = frame.index.format(sparsify=sentinel, adjoin=False, + names=False) + + level_lengths = get_level_lengths(levels, sentinel) + inner_lvl = len(level_lengths) - 1 + if truncate_v: + # Insert ... row and adjust idx_values and + # level_lengths to take this into account. 
+ ins_row = self.fmt.tr_row_num + inserted = False + for lnum, records in enumerate(level_lengths): + rec_new = {} + for tag, span in list(records.items()): + if tag >= ins_row: + rec_new[tag + 1] = span + elif tag + span > ins_row: + rec_new[tag] = span + 1 + + # GH 14882 - Make sure insertion done once + if not inserted: + dot_row = list(idx_values[ins_row - 1]) + dot_row[-1] = u('...') + idx_values.insert(ins_row, tuple(dot_row)) + inserted = True + else: + dot_row = list(idx_values[ins_row]) + dot_row[inner_lvl - lnum] = u('...') + idx_values[ins_row] = tuple(dot_row) + else: + rec_new[tag] = span + # If ins_row lies between tags, all cols idx cols + # receive ... + if tag + span == ins_row: + rec_new[ins_row] = 1 + if lnum == 0: + idx_values.insert(ins_row, tuple( + [u('...')] * len(level_lengths))) + + # GH 14882 - Place ... in correct level + elif inserted: + dot_row = list(idx_values[ins_row]) + dot_row[inner_lvl - lnum] = u('...') + idx_values[ins_row] = tuple(dot_row) + level_lengths[lnum] = rec_new + + level_lengths[inner_lvl][ins_row] = 1 + for ix_col in range(len(fmt_values)): + fmt_values[ix_col].insert(ins_row, '...') + nrows += 1 + + for i in range(nrows): + row = [] + tags = {} + + sparse_offset = 0 + j = 0 + for records, v in zip(level_lengths, idx_values[i]): + if i in records: + if records[i] > 1: + tags[j] = template.format(span=records[i]) + else: + sparse_offset += 1 + continue + + j += 1 + row.append(v) + + row.extend(fmt_values[j][i] for j in range(ncols)) + if truncate_h: + row.insert(row_levels - sparse_offset + + self.fmt.tr_col_num, '...') + self.write_tr(row, indent, self.indent_delta, tags=tags, + nindex_levels=len(levels) - sparse_offset) + else: + for i in range(len(frame)): + idx_values = list(zip(*frame.index.format( + sparsify=False, adjoin=False, names=False))) + row = [] + row.extend(idx_values[i]) + row.extend(fmt_values[j][i] for j in range(ncols)) + if truncate_h: + row.insert(row_levels + self.fmt.tr_col_num, '...') + 
self.write_tr(row, indent, self.indent_delta, tags=None, + nindex_levels=frame.index.nlevels) + + +def single_column_table(column, align=None, style=None): + table = '{i!s}'.format(i=i)) + table += '' + return table + + +def single_row_table(row): # pragma: no cover + table = '' + for i in row: + table += (''.format(i=i)) + table += '
{i!s}
' + return table diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py new file mode 100644 index 0000000000000..67b0a4f0e034e --- /dev/null +++ b/pandas/io/formats/latex.py @@ -0,0 +1,244 @@ +# -*- coding: utf-8 -*- +""" +Module for formatting output data in Latex. +""" + +from __future__ import print_function + +from pandas.core.index import MultiIndex +from pandas import compat +from pandas.compat import range, map, zip, u +from pandas.io.formats.format import TableFormatter +import numpy as np + + +class LatexFormatter(TableFormatter): + """ Used to render a DataFrame to a LaTeX tabular/longtable environment + output. + + Parameters + ---------- + formatter : `DataFrameFormatter` + column_format : str, default None + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' for 3 columns + longtable : boolean, default False + Use a longtable environment instead of tabular. + + See Also + -------- + HTMLFormatter + """ + + def __init__(self, formatter, column_format=None, longtable=False, + multicolumn=False, multicolumn_format=None, multirow=False): + self.fmt = formatter + self.frame = self.fmt.frame + self.bold_rows = self.fmt.kwds.get('bold_rows', False) + self.column_format = column_format + self.longtable = longtable + self.multicolumn = multicolumn + self.multicolumn_format = multicolumn_format + self.multirow = multirow + + def write_result(self, buf): + """ + Render a DataFrame to a LaTeX tabular/longtable environment output. 
+ """ + + # string representation of the columns + if len(self.frame.columns) == 0 or len(self.frame.index) == 0: + info_line = (u('Empty {name}\nColumns: {col}\nIndex: {idx}') + .format(name=type(self.frame).__name__, + col=self.frame.columns, + idx=self.frame.index)) + strcols = [[info_line]] + else: + strcols = self.fmt._to_str_columns() + + def get_col_type(dtype): + if issubclass(dtype.type, np.number): + return 'r' + else: + return 'l' + + # reestablish the MultiIndex that has been joined by _to_str_column + if self.fmt.index and isinstance(self.frame.index, MultiIndex): + clevels = self.frame.columns.nlevels + strcols.pop(0) + name = any(self.frame.index.names) + cname = any(self.frame.columns.names) + lastcol = self.frame.index.nlevels - 1 + previous_lev3 = None + for i, lev in enumerate(self.frame.index.levels): + lev2 = lev.format() + blank = ' ' * len(lev2[0]) + # display column names in last index-column + if cname and i == lastcol: + lev3 = [x if x else '{}' for x in self.frame.columns.names] + else: + lev3 = [blank] * clevels + if name: + lev3.append(lev.name) + current_idx_val = None + for level_idx in self.frame.index.labels[i]: + if ((previous_lev3 is None or + previous_lev3[len(lev3)].isspace()) and + lev2[level_idx] == current_idx_val): + # same index as above row and left index was the same + lev3.append(blank) + else: + # different value than above or left index different + lev3.append(lev2[level_idx]) + current_idx_val = lev2[level_idx] + strcols.insert(i, lev3) + previous_lev3 = lev3 + + column_format = self.column_format + if column_format is None: + dtypes = self.frame.dtypes._values + column_format = ''.join(map(get_col_type, dtypes)) + if self.fmt.index: + index_format = 'l' * self.frame.index.nlevels + column_format = index_format + column_format + elif not isinstance(column_format, + compat.string_types): # pragma: no cover + raise AssertionError('column_format must be str or unicode, ' + 'not {typ}'.format(typ=type(column_format))) + + 
if not self.longtable: + buf.write('\\begin{{tabular}}{{{fmt}}}\n' + .format(fmt=column_format)) + buf.write('\\toprule\n') + else: + buf.write('\\begin{{longtable}}{{{fmt}}}\n' + .format(fmt=column_format)) + buf.write('\\toprule\n') + + ilevels = self.frame.index.nlevels + clevels = self.frame.columns.nlevels + nlevels = clevels + if any(self.frame.index.names): + nlevels += 1 + strrows = list(zip(*strcols)) + self.clinebuf = [] + + for i, row in enumerate(strrows): + if i == nlevels and self.fmt.header: + buf.write('\\midrule\n') # End of header + if self.longtable: + buf.write('\\endhead\n') + buf.write('\\midrule\n') + buf.write('\\multicolumn{{{n}}}{{r}}{{{{Continued on next ' + 'page}}}} \\\\\n'.format(n=len(row))) + buf.write('\\midrule\n') + buf.write('\\endfoot\n\n') + buf.write('\\bottomrule\n') + buf.write('\\endlastfoot\n') + if self.fmt.kwds.get('escape', True): + # escape backslashes first + crow = [(x.replace('\\', '\\textbackslash').replace('_', '\\_') + .replace('%', '\\%').replace('$', '\\$') + .replace('#', '\\#').replace('{', '\\{') + .replace('}', '\\}').replace('~', '\\textasciitilde') + .replace('^', '\\textasciicircum').replace('&', '\\&') + if (x and x != '{}') else '{}') for x in row] + else: + crow = [x if x else '{}' for x in row] + if self.bold_rows and self.fmt.index: + # bold row labels + crow = ['\\textbf{{{x}}}'.format(x=x) + if j < ilevels and x.strip() not in ['', '{}'] else x + for j, x in enumerate(crow)] + if i < clevels and self.fmt.header and self.multicolumn: + # sum up columns to multicolumns + crow = self._format_multicolumn(crow, ilevels) + if (i >= nlevels and self.fmt.index and self.multirow and + ilevels > 1): + # sum up rows to multirows + crow = self._format_multirow(crow, ilevels, i, strrows) + buf.write(' & '.join(crow)) + buf.write(' \\\\\n') + if self.multirow and i < len(strrows) - 1: + self._print_cline(buf, i, len(strcols)) + + if not self.longtable: + buf.write('\\bottomrule\n') + 
buf.write('\\end{tabular}\n') + else: + buf.write('\\end{longtable}\n') + + def _format_multicolumn(self, row, ilevels): + r""" + Combine columns belonging to a group to a single multicolumn entry + according to self.multicolumn_format + + e.g.: + a & & & b & c & + will become + \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c} + """ + row2 = list(row[:ilevels]) + ncol = 1 + coltext = '' + + def append_col(): + # write multicolumn if needed + if ncol > 1: + row2.append('\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}' + .format(ncol=ncol, fmt=self.multicolumn_format, + txt=coltext.strip())) + # don't modify where not needed + else: + row2.append(coltext) + for c in row[ilevels:]: + # if next col has text, write the previous + if c.strip(): + if coltext: + append_col() + coltext = c + ncol = 1 + # if not, add it to the previous multicolumn + else: + ncol += 1 + # write last column name + if coltext: + append_col() + return row2 + + def _format_multirow(self, row, ilevels, i, rows): + r""" + Check following rows, whether row should be a multirow + + e.g.: becomes: + a & 0 & \multirow{2}{*}{a} & 0 & + & 1 & & 1 & + b & 0 & \cline{1-2} + b & 0 & + """ + for j in range(ilevels): + if row[j].strip(): + nrow = 1 + for r in rows[i + 1:]: + if not r[j].strip(): + nrow += 1 + else: + break + if nrow > 1: + # overwrite non-multirow entry + row[j] = '\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}'.format( + nrow=nrow, row=row[j].strip()) + # save when to end the current block with \cline + self.clinebuf.append([i + nrow - 1, j + 1]) + return row + + def _print_cline(self, buf, i, icol): + """ + Print clines after multirow-blocks are finished + """ + for cl in self.clinebuf: + if cl[0] == i: + buf.write('\\cline{{{cl:d}-{icol:d}}}\n' + .format(cl=cl[1], icol=icol)) + # remove entries that have been written to buffer + self.clinebuf = [x for x in self.clinebuf if x[0] != i]