Skip to content

Preliminary format refactor #20341

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Mar 14, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
fb79b19
Separating the formatters into different files
shangyian Mar 8, 2018
d57630d
Fixing flake8 issues
shangyian Mar 8, 2018
d4bcb17
Changing _put_lines from protected function
shangyian Mar 8, 2018
dc0778a
Adding back CSVFormatter to be compatible
shangyian Mar 8, 2018
13ba22f
Some small changes based on feedback
shangyian Mar 8, 2018
fcfb87a
flake8 fixes
shangyian Mar 8, 2018
7b26f2c
changing csv import name or else conflicts
shangyian Mar 8, 2018
d6f4c8e
renaming because of naming conflicts otherwise
shangyian Mar 8, 2018
3afd771
Merge branch 'master' into PR_TOOL_MERGE_PR_20051
jreback Mar 10, 2018
f8abfec
clean up imports
jreback Mar 10, 2018
196220b
rename csv -> csvs
jreback Mar 10, 2018
6451eb5
Removing io.formats.common and moving functions to formats.format
shangyian Mar 10, 2018
1b4182a
Fixing imports
shangyian Mar 10, 2018
8e5fdec
Separating the formatters into different files
shangyian Mar 8, 2018
e114ba9
Fixing flake8 issues
shangyian Mar 8, 2018
b3e5ff4
Changing _put_lines from protected function
shangyian Mar 8, 2018
832d26e
Adding back CSVFormatter to be compatible
shangyian Mar 8, 2018
45f6aac
Some small changes based on feedback
shangyian Mar 8, 2018
fae177b
flake8 fixes
shangyian Mar 8, 2018
2cd9648
changing csv import name or else conflicts
shangyian Mar 8, 2018
691aa6c
renaming because of naming conflicts otherwise
shangyian Mar 8, 2018
28b66dc
clean up imports
jreback Mar 10, 2018
764533c
rename csv -> csvs
jreback Mar 10, 2018
719b02c
Removing io.formats.common and moving functions to formats.format
shangyian Mar 10, 2018
de39e22
Fixing imports
shangyian Mar 10, 2018
c9b4504
Merge branch 'prelim_format_refactor' of github.com:shangyian/pandas …
shangyian Mar 14, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 16 additions & 16 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@
import pandas.core.common as com
import pandas.core.nanops as nanops
import pandas.core.ops as ops
import pandas.io.formats.format as fmt
import pandas.io.formats.console as console
import pandas.io.formats.format as fmt
from pandas.io.formats.printing import pprint_thing
import pandas.plotting._core as gfx

Expand Down Expand Up @@ -1695,18 +1695,19 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
else:
tupleize_cols = False

formatter = fmt.CSVFormatter(self, path_or_buf,
line_terminator=line_terminator, sep=sep,
encoding=encoding,
compression=compression, quoting=quoting,
na_rep=na_rep, float_format=float_format,
cols=columns, header=header, index=index,
index_label=index_label, mode=mode,
chunksize=chunksize, quotechar=quotechar,
tupleize_cols=tupleize_cols,
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar, decimal=decimal)
from pandas.io.formats.csvs import CSVFormatter
formatter = CSVFormatter(self, path_or_buf,
line_terminator=line_terminator, sep=sep,
encoding=encoding,
compression=compression, quoting=quoting,
na_rep=na_rep, float_format=float_format,
cols=columns, header=header, index=index,
index_label=index_label, mode=mode,
chunksize=chunksize, quotechar=quotechar,
tupleize_cols=tupleize_cols,
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar, decimal=decimal)
formatter.save()

if path_or_buf is None:
Expand Down Expand Up @@ -1997,7 +1998,6 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
- If False, never show counts.

"""
from pandas.io.formats.format import _put_lines

if buf is None: # pragma: no cover
buf = sys.stdout
Expand All @@ -2009,7 +2009,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,

if len(self.columns) == 0:
lines.append('Empty %s' % type(self).__name__)
_put_lines(buf, lines)
fmt.buffer_put_lines(buf, lines)
return

cols = self.columns
Expand Down Expand Up @@ -2096,7 +2096,7 @@ def _sizeof_fmt(num, size_qualifier):
mem_usage = self.memory_usage(index=True, deep=deep).sum()
lines.append("memory usage: %s\n" %
_sizeof_fmt(mem_usage, size_qualifier))
_put_lines(buf, lines)
fmt.buffer_put_lines(buf, lines)

def memory_usage(self, index=True, deep=False):
"""Memory usage of DataFrame columns.
Expand Down
44 changes: 0 additions & 44 deletions pandas/io/formats/common.py

This file was deleted.

280 changes: 280 additions & 0 deletions pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
# -*- coding: utf-8 -*-
"""
Module for formatting output data into CSV files.
"""

from __future__ import print_function

import csv as csvlib
import numpy as np

from pandas.core.dtypes.missing import notna
from pandas.core.index import Index, MultiIndex
from pandas import compat
from pandas.compat import (StringIO, range, zip)

from pandas.io.common import (_get_handle, UnicodeWriter, _expand_user,
_stringify_path)
from pandas._libs import writers as libwriters
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.period import PeriodIndex


class CSVFormatter(object):
    """Format a DataFrame and write it out as CSV.

    All output options are resolved and validated in ``__init__``;
    :meth:`save` then writes the frame to ``path_or_buf`` in row chunks.
    """

    def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
                 float_format=None, cols=None, header=True, index=True,
                 index_label=None, mode='w', nanRep=None, encoding=None,
                 compression=None, quoting=None, line_terminator='\n',
                 chunksize=None, tupleize_cols=False, quotechar='"',
                 date_format=None, doublequote=True, escapechar=None,
                 decimal='.'):
        # NOTE(review): ``nanRep`` is accepted but never used below —
        # presumably kept only for backward compatibility; confirm with
        # callers before removing.

        self.obj = obj

        # With no target given, write into an in-memory buffer so the
        # caller can retrieve the CSV text afterwards.
        if path_or_buf is None:
            path_or_buf = StringIO()

        self.path_or_buf = _expand_user(_stringify_path(path_or_buf))
        self.sep = sep
        self.na_rep = na_rep
        self.float_format = float_format
        self.decimal = decimal

        self.header = header
        self.index = index
        self.index_label = index_label
        self.mode = mode
        self.encoding = encoding
        self.compression = compression

        if quoting is None:
            quoting = csvlib.QUOTE_MINIMAL
        self.quoting = quoting

        if quoting == csvlib.QUOTE_NONE:
            # prevents crash in _csv
            quotechar = None
        self.quotechar = quotechar

        self.doublequote = doublequote
        self.escapechar = escapechar

        self.line_terminator = line_terminator

        self.date_format = date_format

        self.tupleize_cols = tupleize_cols
        # Multi-level column headers need special handling in
        # _save_header, unless they are being collapsed into tuples.
        self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
                               not self.tupleize_cols)

        # validate mi options
        if self.has_mi_columns:
            if cols is not None:
                raise TypeError("cannot specify cols with a MultiIndex on the "
                                "columns")

        # An explicit column selection restricts self.obj to those columns.
        if cols is not None:
            if isinstance(cols, Index):
                cols = cols.to_native_types(na_rep=na_rep,
                                            float_format=float_format,
                                            date_format=date_format,
                                            quoting=self.quoting)
            else:
                cols = list(cols)
            self.obj = self.obj.loc[:, cols]

        # update columns to include possible multiplicity of dupes
        # and make sure cols is just a list of labels
        cols = self.obj.columns
        if isinstance(cols, Index):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)

        # save it
        self.cols = cols

        # preallocate data 2d list; filled per-chunk in _save_chunk,
        # indexed by each block's mgr_locs
        self.blocks = self.obj._data.blocks
        ncols = sum(b.shape[0] for b in self.blocks)
        self.data = [None] * ncols

        # Default chunk size targets roughly 100k cells per chunk,
        # never less than one row.
        if chunksize is None:
            chunksize = (100000 // (len(self.cols) or 1)) or 1
        self.chunksize = int(chunksize)

        self.data_index = obj.index
        # Pre-render datetime-like index values once up front, rather
        # than formatting them row by row during the write.
        if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex)) and
                date_format is not None):
            self.data_index = Index([x.strftime(date_format) if notna(x) else
                                     '' for x in self.data_index])

        # Number of leading index columns written per row (0 if index=False).
        self.nlevels = getattr(self.data_index, 'nlevels', 1)
        if not index:
            self.nlevels = 0

    def save(self):
        """Open the output handle, build the csv writer and write the frame."""
        # create the writer & save
        if self.encoding is None:
            if compat.PY2:
                encoding = 'ascii'
            else:
                encoding = 'utf-8'
        else:
            encoding = self.encoding

        # A file-like object is used as-is and left open for the caller;
        # a path is opened here and closed in the finally block.
        if hasattr(self.path_or_buf, 'write'):
            f = self.path_or_buf
            close = False
        else:
            f, handles = _get_handle(self.path_or_buf, self.mode,
                                     encoding=encoding,
                                     compression=self.compression)
            close = True

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep, quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            # Plain csv.writer suffices for ascii; any other encoding
            # goes through the encoding-aware UnicodeWriter.
            if encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            if close:
                f.close()

    def _save_header(self):
        """Write the header row(s), including any index labels."""

        writer = self.writer
        obj = self.obj
        index_label = self.index_label
        cols = self.cols
        has_mi_columns = self.has_mi_columns
        header = self.header
        encoded_labels = []

        # ``header`` may be a bool (write the column names or not) or a
        # sequence of replacement labels (aliases) for the columns.
        has_aliases = isinstance(header, (tuple, list, np.ndarray, Index))
        if not (has_aliases or self.header):
            return
        if has_aliases:
            if len(header) != len(cols):
                raise ValueError(('Writing {ncols} cols but got {nalias} '
                                  'aliases'.format(ncols=len(cols),
                                                   nalias=len(header))))
            else:
                write_cols = header
        else:
            write_cols = cols

        if self.index:
            # should write something for index label
            if index_label is not False:
                if index_label is None:
                    # Derive the labels from the index itself, one per level.
                    if isinstance(obj.index, MultiIndex):
                        index_label = []
                        for i, name in enumerate(obj.index.names):
                            if name is None:
                                name = ''
                            index_label.append(name)
                    else:
                        index_label = obj.index.name
                        if index_label is None:
                            index_label = ['']
                        else:
                            index_label = [index_label]
                elif not isinstance(index_label,
                                    (list, tuple, np.ndarray, Index)):
                    # given a string for a DF with Index
                    index_label = [index_label]

                encoded_labels = list(index_label)
            else:
                encoded_labels = []

        if not has_mi_columns or has_aliases:
            # Single header row: index labels followed by column labels.
            encoded_labels += list(write_cols)
            writer.writerow(encoded_labels)
        else:
            # write out the mi
            columns = obj.columns

            # write out the names for each level, then ALL of the values for
            # each level
            for i in range(columns.nlevels):

                # we need at least 1 index column to write our col names
                col_line = []
                if self.index:

                    # name is the first column
                    col_line.append(columns.names[i])

                    if isinstance(index_label, list) and len(index_label) > 1:
                        col_line.extend([''] * (len(index_label) - 1))

                col_line.extend(columns._get_level_values(i))

                writer.writerow(col_line)

            # Write out the index line if it's not empty.
            # Otherwise, we will print out an extraneous
            # blank line between the mi and the data rows.
            if encoded_labels and set(encoded_labels) != set(['']):
                encoded_labels.extend([''] * len(columns))
                writer.writerow(encoded_labels)

    def _save(self):
        """Write the header, then the data rows chunk by chunk."""

        self._save_header()

        nrows = len(self.data_index)

        # write in chunksize bites
        chunksize = self.chunksize
        chunks = int(nrows / chunksize) + 1

        for i in range(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, nrows)
            # The "+ 1" above can produce an empty trailing chunk; stop there.
            if start_i >= end_i:
                break

            self._save_chunk(start_i, end_i)

    def _save_chunk(self, start_i, end_i):
        """Render rows [start_i, end_i) to native-type strings and write them."""

        data_index = self.data_index

        # create the data for a chunk
        slicer = slice(start_i, end_i)
        for i in range(len(self.blocks)):
            b = self.blocks[i]
            d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                  float_format=self.float_format,
                                  decimal=self.decimal,
                                  date_format=self.date_format,
                                  quoting=self.quoting)

            for col_loc, col in zip(b.mgr_locs, d):
                # self.data is a preallocated list
                self.data[col_loc] = col

        ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                        float_format=self.float_format,
                                        decimal=self.decimal,
                                        date_format=self.date_format,
                                        quoting=self.quoting)

        # Hand the rendered columns + index off to the C writer routine.
        libwriters.write_csv_rows(self.data, ix, self.nlevels,
                                  self.cols, self.writer)
Loading