
Commit 4a04fae

Author: y-p (committed)

Merge branch 'GH3054/to_csv_perf'

* GH3054/to_csv_perf: (28 commits)
  TST: mark csv test @slow
  DOC: update what's new, RELEASE.rst
  BUG: MultiIndex to_native_types did not obey slicer
  TST: test the hell out of the new df.to_csv()
  CLN: preallocate data array only once
  TST: fail early on duplicate columns
  ENH: add (undocumented) legacy kwd to df.to_csv, just in case
  CLN: csv refactor
  TST: test for to_csv on failing vbench
  duplicate column names across dtypes is a problem, and not easy to fix, so letting test fail
  PERF: avoid iteritems->iloc penalty for data conversion, use blocks
  ENH: make chunks process constant element count
  ENH: replace variable lookup by constant
  ENH: refactor series from dict to list, eliminate one level of indirection
  TST: test for to_csv on failing vbench
  PERF: added frame_to_csv2 vbench, revised frame_to_csv_mixed
  REF: apply native type conv to ix, cols before write_csv
  REF: add com._ndarray_to_native_types
  CLN: make guard more defensive
  CLN: move repeated cast out of loop
  ENH: add chunksize parameter to DataFrame.to_csv to enable constant memory usage by writing in chunks
  ...

Conflicts:
  RELEASE.rst
  doc/source/v0.11.0.txt
  pandas/core/internals.py
2 parents: f8fbc08 + 8614498
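The user-facing change in this merge is the new chunksize keyword on DataFrame.to_csv, which writes the frame in row chunks so that peak memory stays roughly constant; per the tests added here, the chunked output is intended to match a one-shot write. A minimal usage sketch (the frame shape and file name are illustrative, not from this commit):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(1000000, 10))

    # Write 50,000 rows per chunk; only one chunk is converted and
    # buffered at a time, so memory use stays bounded.
    df.to_csv('big.csv', chunksize=50000)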

File tree

10 files changed: +618, -122 lines

RELEASE.rst (+2, -1)

@@ -47,6 +47,7 @@ pandas 0.11.0
 
 **Improvements to existing features**
 
+- Improved performance of df.to_csv() by up to 10x in some cases. (GH3059_)
 - added ``blocks`` attribute to DataFrames, to return a dict of dtypes to
   homogeneously dtyped DataFrames
 - added keyword ``convert_numeric`` to ``convert_objects()`` to try to
@@ -248,7 +249,7 @@ pandas 0.11.0
 .. _GH3053: https://github.com/pydata/pandas/issues/3053
 .. _GH3076: https://github.com/pydata/pandas/issues/3076
 .. _GH3063: https://github.com/pydata/pandas/issues/3063
-
+.. _GH3059: https://github.com/pydata/pandas/issues/3059
 
 pandas 0.10.1
 =============

doc/source/v0.11.0.txt (+3)

@@ -229,6 +229,8 @@ API changes
 Enhancements
 ~~~~~~~~~~~~
 
+- Improved performance of df.to_csv() by up to 10x in some cases. (GH3059_)
+
 - Numexpr is now a :ref:`Recommended Dependencies <install.recommended_dependencies>`, to accelerate certain
   types of numerical and boolean operations
 
@@ -307,3 +309,4 @@ on GitHub for a complete list.
 .. _GH2979: https://github.com/pydata/pandas/issues/2979
 .. _GH3011: https://github.com/pydata/pandas/issues/3011
 .. _GH3076: https://github.com/pydata/pandas/issues/3076
+.. _GH3059: https://github.com/pydata/pandas/issues/3059
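The changelog's "up to 10x" figure can be sanity-checked with a crude timing harness. This sketch is not the project's vbench suite, and the frame shape and output path are made up:

    import time
    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(100000, 50))

    start = time.time()
    df.to_csv('/tmp/to_csv_bench.csv')
    print 'to_csv: %.2fs' % (time.time() - start)  # run before and after this merge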

pandas/core/common.py (+20, -1)

@@ -101,7 +101,6 @@ def _isnull_old(obj):
 
 _isnull = _isnull_new
 
-
 def _use_inf_as_null(key):
     '''Option change callback for null/inf behaviour
     Choose which replacement for numpy.isnan / -numpy.isfinite is used.
@@ -1625,6 +1624,26 @@ def _check_as_is(x):
         # empty queue
         self.queue.truncate(0)
 
+    def writerows(self, rows):
+        def _check_as_is(x):
+            return (self.quoting == csv.QUOTE_NONNUMERIC and
+                    is_number(x)) or isinstance(x, str)
+
+        for i, row in enumerate(rows):
+            rows[i] = [x if _check_as_is(x)
+                       else pprint_thing(x).encode('utf-8') for x in row]
+
+        self.writer.writerows([[s for s in row] for row in rows])
+        # Fetch UTF-8 output from the queue ...
+        data = self.queue.getvalue()
+        data = data.decode("utf-8")
+        # ... and reencode it into the target encoding
+        data = self.encoder.encode(data)
+        # write to the target stream
+        self.stream.write(data)
+        # empty queue
+        self.queue.truncate(0)
+
 
 _NS_DTYPE = np.dtype('M8[ns]')
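For context, the writerows method added above parallels the class's existing writerow, and both follow the UnicodeWriter recipe from the Python 2 csv module documentation. A minimal sketch of that pattern (not pandas' exact class, but it carries the same queue/writer/stream/encoder attributes the diff relies on):

    import csv
    import codecs
    import cStringIO

    class UnicodeWriter(object):
        """Write unicode rows to a stream in any encoding (Python 2 recipe)."""

        def __init__(self, f, dialect=csv.excel, encoding='utf-8', **kwds):
            self.queue = cStringIO.StringIO()   # rows are staged here as UTF-8
            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
            self.stream = f
            self.encoder = codecs.getincrementalencoder(encoding)()

        def writerow(self, row):
            self.writer.writerow([unicode(s).encode('utf-8') for s in row])
            data = self.queue.getvalue().decode('utf-8')  # fetch staged UTF-8 ...
            self.stream.write(self.encoder.encode(data))  # ... re-encode to target
            self.queue.truncate(0)                        # empty the queue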

pandas/core/format.py (+256, -1)

@@ -9,7 +9,7 @@
 from io import StringIO
 
 from pandas.core.common import adjoin, isnull, notnull
-from pandas.core.index import MultiIndex, _ensure_index
+from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.util import py3compat
 from pandas.core.config import get_option, set_option, reset_option
 import pandas.core.common as com
@@ -18,6 +18,7 @@
 import numpy as np
 
 import itertools
+import csv
 
 from pandas.tseries.period import PeriodIndex
 
@@ -763,6 +764,260 @@ def grouper(x):
     return result
 
 
+class CSVFormatter(object):
+
+    def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
+                 cols=None, header=True, index=True, index_label=None,
+                 mode='w', nanRep=None, encoding=None, quoting=None,
+                 line_terminator='\n', chunksize=None, legacy=False):
+        self.legacy = legacy  # remove for 0.12
+        self.obj = obj
+        self.path_or_buf = path_or_buf
+        self.sep = sep
+        self.na_rep = na_rep
+        self.float_format = float_format
+
+        self.header = header
+        self.index = index
+        self.index_label = index_label
+        self.mode = mode
+        self.encoding = encoding
+
+        if quoting is None:
+            quoting = csv.QUOTE_MINIMAL
+        self.quoting = quoting
+
+        self.line_terminator = line_terminator
+
+        if cols is None:
+            cols = obj.columns
+
+        if isinstance(cols, Index):
+            cols = cols.to_native_types(na_rep=na_rep, float_format=float_format)
+        else:
+            cols = list(cols)
+        self.cols = cols
+
+        # preallocate data 2d list
+        self.blocks = self.obj._data.blocks
+        ncols = sum(len(b.items) for b in self.blocks)
+        self.data = [None] * ncols
+
+        # fail early if we have duplicate columns
+        if len(set(self.cols)) != len(self.cols):
+            raise Exception("duplicate columns are not permitted in to_csv")
+
+        self.colname_map = dict((k, i) for i, k in enumerate(obj.columns))
+
+        if chunksize is None:
+            chunksize = (100000 / (len(self.cols) or 1)) or 1
+        self.chunksize = chunksize
+
+        self.data_index = obj.index
+        if isinstance(obj.index, PeriodIndex):
+            self.data_index = obj.index.to_timestamp()
+
+        self.nlevels = getattr(self.data_index, 'nlevels', 1)
+        if not index:
+            self.nlevels = 0
+
+    # legacy to be removed in 0.12
+    def _helper_csv(self, writer, na_rep=None, cols=None,
+                    header=True, index=True,
+                    index_label=None, float_format=None):
+        if cols is None:
+            cols = self.columns
+
+        series = {}
+        for k, v in self.obj._series.iteritems():
+            series[k] = v.values
+
+        has_aliases = isinstance(header, (tuple, list, np.ndarray))
+        if has_aliases or header:
+            if index:
+                # should write something for index label
+                if index_label is not False:
+                    if index_label is None:
+                        if isinstance(self.obj.index, MultiIndex):
+                            index_label = []
+                            for i, name in enumerate(self.obj.index.names):
+                                if name is None:
+                                    name = ''
+                                index_label.append(name)
+                        else:
+                            index_label = self.obj.index.name
+                            if index_label is None:
+                                index_label = ['']
+                            else:
+                                index_label = [index_label]
+                    elif not isinstance(index_label, (list, tuple, np.ndarray)):
+                        # given a string for a DF with Index
+                        index_label = [index_label]
+
+                    encoded_labels = list(index_label)
+                else:
+                    encoded_labels = []
+
+                if has_aliases:
+                    if len(header) != len(cols):
+                        raise ValueError(('Writing %d cols but got %d aliases'
+                                          % (len(cols), len(header))))
+                    else:
+                        write_cols = header
+                else:
+                    write_cols = cols
+                encoded_cols = list(write_cols)
+
+                writer.writerow(encoded_labels + encoded_cols)
+            else:
+                encoded_cols = list(cols)
+                writer.writerow(encoded_cols)
+
+        data_index = self.obj.index
+        if isinstance(self.obj.index, PeriodIndex):
+            data_index = self.obj.index.to_timestamp()
+
+        nlevels = getattr(data_index, 'nlevels', 1)
+        for j, idx in enumerate(data_index):
+            row_fields = []
+            if index:
+                if nlevels == 1:
+                    row_fields = [idx]
+                else:  # handle MultiIndex
+                    row_fields = list(idx)
+            for i, col in enumerate(cols):
+                val = series[col][j]
+                if lib.checknull(val):
+                    val = na_rep
+
+                if float_format is not None and com.is_float(val):
+                    val = float_format % val
+                elif isinstance(val, np.datetime64):
+                    val = lib.Timestamp(val)._repr_base
+
+                row_fields.append(val)
+
+            writer.writerow(row_fields)
+
+    def save(self):
+        # create the writer & save
+        if hasattr(self.path_or_buf, 'read'):
+            f = self.path_or_buf
+            close = False
+        else:
+            f = com._get_handle(self.path_or_buf, self.mode, encoding=self.encoding)
+            close = True
+
+        try:
+            if self.encoding is not None:
+                self.writer = com.UnicodeWriter(f, lineterminator=self.line_terminator,
+                                                delimiter=self.sep, encoding=self.encoding,
+                                                quoting=self.quoting)
+            else:
+                self.writer = csv.writer(f, lineterminator=self.line_terminator,
+                                         delimiter=self.sep, quoting=self.quoting)
+
+            if self.legacy:
+                # to be removed in 0.12
+                self._helper_csv(self.writer, na_rep=self.na_rep,
+                                 float_format=self.float_format, cols=self.cols,
+                                 header=self.header, index=self.index,
+                                 index_label=self.index_label)
+            else:
+                self._save()
+
+        finally:
+            if close:
+                f.close()
+
+    def _save_header(self):
+
+        writer = self.writer
+        obj = self.obj
+        index_label = self.index_label
+        cols = self.cols
+        header = self.header
+
+        has_aliases = isinstance(header, (tuple, list, np.ndarray))
+        if has_aliases or self.header:
+            if self.index:
+                # should write something for index label
+                if index_label is not False:
+                    if index_label is None:
+                        if isinstance(obj.index, MultiIndex):
+                            index_label = []
+                            for i, name in enumerate(obj.index.names):
+                                if name is None:
+                                    name = ''
+                                index_label.append(name)
+                        else:
+                            index_label = obj.index.name
+                            if index_label is None:
+                                index_label = ['']
+                            else:
+                                index_label = [index_label]
+                    elif not isinstance(index_label, (list, tuple, np.ndarray)):
+                        # given a string for a DF with Index
+                        index_label = [index_label]
+
+                    encoded_labels = list(index_label)
+                else:
+                    encoded_labels = []
+
+                if has_aliases:
+                    if len(header) != len(cols):
+                        raise ValueError(('Writing %d cols but got %d aliases'
+                                          % (len(cols), len(header))))
+                    else:
+                        write_cols = header
+                else:
+                    write_cols = cols
+                encoded_cols = list(write_cols)
+
+                writer.writerow(encoded_labels + encoded_cols)
+            else:
+                encoded_cols = list(cols)
+                writer.writerow(encoded_cols)
+
+    def _save(self):
+
+        self._save_header()
+
+        nrows = len(self.data_index)
+
+        # write in chunksize bites
+        chunksize = self.chunksize
+        chunks = int(nrows / chunksize) + 1
+
+        for i in xrange(chunks):
+            start_i = i * chunksize
+            end_i = min((i + 1) * chunksize, nrows)
+            if start_i >= end_i:
+                break
+
+            self._save_chunk(start_i, end_i)
+
+    def _save_chunk(self, start_i, end_i):
+
+        colname_map = self.colname_map
+        data_index = self.data_index
+
+        # create the data for a chunk
+        slicer = slice(start_i, end_i)
+        for i in range(len(self.blocks)):
+            b = self.blocks[i]
+            d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
+            for j, k in enumerate(b.items):
+                # self.data is a preallocated list
+                self.data[colname_map[k]] = d[j]
+
+        ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
+
+        lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
+
 # from collections import namedtuple
 # ExcelCell = namedtuple("ExcelCell",
 #                        'row, col, val, style, mergestart, mergeend')
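For intuition about the chunked write in _save: the row range is split into half-open slices of chunksize rows, the final slice is clamped to nrows, and an empty trailing slice is skipped. A standalone sketch with toy numbers (Python 2 semantics, matching the xrange style above):

    nrows, chunksize = 10, 4
    chunks = int(nrows / chunksize) + 1          # 3 candidate chunks

    for i in xrange(chunks):
        start_i = i * chunksize
        end_i = min((i + 1) * chunksize, nrows)  # clamp the final chunk
        if start_i >= end_i:                     # skip an empty trailing chunk
            break
        print 'rows [%d:%d)' % (start_i, end_i)  # [0:4) [4:8) [8:10)

When chunksize is not given, __init__ above defaults it to roughly 100,000 cells' worth of rows (100000 / ncols, floored under Python 2 integer division), so each chunk converts about the same number of elements regardless of frame width.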
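The other key idea sits in _save_chunk: instead of converting the frame cell by cell (as the legacy _helper_csv does via iteritems), each homogeneously-typed block is converted to native strings once per chunk, and its columns are scattered into a preallocated per-column list by position. A hedged, self-contained sketch of that scatter step, with made-up block data standing in for the real Block objects:

    columns = ['a', 'b', 'c']
    colname_map = dict((k, i) for i, k in enumerate(columns))

    # pretend dtype blocks: (column names, stringified data per column)
    blocks = [(['a', 'c'], [['1', '2'], ['x', 'y']]),
              (['b'], [['0.50', '0.25']])]

    data = [None] * len(columns)          # preallocated once, reused per chunk
    for items, d in blocks:
        for j, k in enumerate(items):
            data[colname_map[k]] = d[j]   # place block column into its slot

    print data  # [['1', '2'], ['0.50', '0.25'], ['x', 'y']]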
