Skip to content

Commit 3d54482

Browse files
committed
perf improvements for other native type writers
1 parent 07b39c8 commit 3d54482

File tree

9 files changed

+87
-51
lines changed

9 files changed

+87
-51
lines changed

doc/source/whatsnew/v0.16.1.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@ API changes
8989
Performance Improvements
9090
~~~~~~~~~~~~~~~~~~~~~~~~
9191

92-
- Improved csv write performance with mixed dtypes, including datetimes (:issue:`9940`)
92+
- Improved csv write performance with mixed dtypes, including datetimes, by up to 5x (:issue:`9940`)
93+
- Improved general csv write performance by 2x (:issue:`9940`)
9394

9495

9596

pandas/core/format.py

+27-18
Original file line numberDiff line numberDiff line change
@@ -1258,9 +1258,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
12581258
if isinstance(cols, Index):
12591259
cols = cols.to_native_types(na_rep=na_rep,
12601260
float_format=float_format,
1261-
date_format=date_format)
1261+
date_format=date_format,
1262+
quoting=self.quoting)
12621263
else:
1263-
cols = list(cols)
1264+
cols = np.asarray(list(cols))
12641265
self.obj = self.obj.loc[:, cols]
12651266

12661267
# update columns to include possible multiplicity of dupes
@@ -1269,9 +1270,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
12691270
if isinstance(cols, Index):
12701271
cols = cols.to_native_types(na_rep=na_rep,
12711272
float_format=float_format,
1272-
date_format=date_format)
1273+
date_format=date_format,
1274+
quoting=self.quoting)
12731275
else:
1274-
cols = list(cols)
1276+
cols = np.asarray(list(cols))
12751277

12761278
# save it
12771279
self.cols = cols
@@ -1370,8 +1372,10 @@ def strftime_with_nulls(x):
13701372
values = self.obj.copy()
13711373
values.index = data_index
13721374
values.columns = values.columns.to_native_types(
1373-
na_rep=na_rep, float_format=float_format,
1374-
date_format=date_format)
1375+
na_rep=na_rep,
1376+
float_format=float_format,
1377+
date_format=date_format,
1378+
quoting=self.quoting)
13751379
values = values[cols]
13761380

13771381
series = {}
@@ -1542,18 +1546,22 @@ def _save_chunk(self, start_i, end_i):
15421546
slicer = slice(start_i, end_i)
15431547
for i in range(len(self.blocks)):
15441548
b = self.blocks[i]
1545-
d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
1549+
d = b.to_native_types(slicer=slicer,
1550+
na_rep=self.na_rep,
15461551
float_format=self.float_format,
15471552
decimal=self.decimal,
1548-
date_format=self.date_format)
1553+
date_format=self.date_format,
1554+
quoting=self.quoting)
15491555

15501556
for col_loc, col in zip(b.mgr_locs, d):
15511557
# self.data is a preallocated list
15521558
self.data[col_loc] = col
15531559

1554-
ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
1560+
ix = data_index.to_native_types(slicer=slicer,
1561+
na_rep=self.na_rep,
15551562
float_format=self.float_format,
1556-
date_format=self.date_format)
1563+
date_format=self.date_format,
1564+
quoting=self.quoting)
15571565

15581566
lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
15591567

@@ -2037,15 +2045,8 @@ def _format_strings(self):
20372045
values = DatetimeIndex(values)
20382046

20392047
if values.tz is None:
2040-
2041-
is_dates_only = _is_dates_only(values)
2042-
if is_dates_only:
2043-
formatter = self.date_format or "%Y-%m-%d"
2044-
else:
2045-
formatter = None
2046-
20472048
fmt_values = format_array_from_datetime(values.asi8.ravel(),
2048-
format=formatter,
2049+
format=_get_format_datetime64_from_values(values, self.date_format),
20492050
na_rep=self.nat_rep).reshape(values.shape)
20502051
fmt_values = fmt_values.tolist()
20512052

@@ -2105,6 +2106,14 @@ def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None):
21052106
return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep)
21062107

21072108

2109+
def _get_format_datetime64_from_values(values, date_format):
2110+
""" given values and a date_format, return a string format """
2111+
is_dates_only = _is_dates_only(values)
2112+
if is_dates_only:
2113+
return date_format or "%Y-%m-%d"
2114+
return None
2115+
2116+
21082117
class Timedelta64Formatter(GenericArrayFormatter):
21092118

21102119
def __init__(self, values, nat_rep='NaT', box=False, **kwargs):

pandas/core/index.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -1071,12 +1071,16 @@ def to_native_types(self, slicer=None, **kwargs):
10711071
values = values[slicer]
10721072
return values._format_native_types(**kwargs)
10731073

1074-
def _format_native_types(self, na_rep='', **kwargs):
1074+
def _format_native_types(self, na_rep='', quoting=None, **kwargs):
10751075
""" actually format my specific types """
10761076
mask = isnull(self)
1077-
values = np.array(self, dtype=object, copy=True)
1077+
if not self.is_object() and not quoting:
1078+
values = np.asarray(self).astype(str)
1079+
else:
1080+
values = np.array(self, dtype=object, copy=True)
1081+
10781082
values[mask] = na_rep
1079-
return values.tolist()
1083+
return values
10801084

10811085
def equals(self, other):
10821086
"""
@@ -3298,7 +3302,7 @@ def _reference_duplicate_name(self, name):
32983302
return np.sum(name == np.asarray(self.names)) > 1
32993303

33003304
def _format_native_types(self, **kwargs):
3301-
return self.tolist()
3305+
return self.values
33023306

33033307
@property
33043308
def _constructor(self):

pandas/core/internals.py

+29-18
Original file line numberDiff line numberDiff line change
@@ -484,16 +484,21 @@ def _try_coerce_and_cast_result(self, result, dtype=None):
484484
def _try_fill(self, value):
485485
return value
486486

487-
def to_native_types(self, slicer=None, na_rep='', **kwargs):
487+
def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
488488
""" convert to our native types format, slicing if desired """
489489

490490
values = self.values
491491
if slicer is not None:
492492
values = values[:, slicer]
493-
values = np.array(values, dtype=object)
494493
mask = isnull(values)
494+
495+
if not self.is_object and not quoting:
496+
values = values.astype(str)
497+
else:
498+
values = np.array(values, dtype='object')
499+
495500
values[mask] = na_rep
496-
return values.tolist()
501+
return values
497502

498503
# block actions ####
499504
def copy(self, deep=True):
@@ -1221,32 +1226,34 @@ def _try_cast(self, element):
12211226
return element
12221227

12231228
def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.',
1224-
**kwargs):
1229+
quoting=None, **kwargs):
12251230
""" convert to our native types format, slicing if desired """
12261231

12271232
values = self.values
12281233
if slicer is not None:
12291234
values = values[:, slicer]
1230-
values = np.array(values, dtype=object)
12311235
mask = isnull(values)
1232-
values[mask] = na_rep
1233-
12341236

1237+
formatter = None
12351238
if float_format and decimal != '.':
12361239
formatter = lambda v : (float_format % v).replace('.',decimal,1)
12371240
elif decimal != '.':
12381241
formatter = lambda v : ('%g' % v).replace('.',decimal,1)
12391242
elif float_format:
12401243
formatter = lambda v : float_format % v
1244+
1245+
if formatter is None and not quoting:
1246+
values = values.astype(str)
12411247
else:
1242-
formatter = None
1248+
values = np.array(values, dtype='object')
12431249

1250+
values[mask] = na_rep
12441251
if formatter:
12451252
imask = (~mask).ravel()
12461253
values.flat[imask] = np.array(
12471254
[formatter(val) for val in values.ravel()[imask]])
12481255

1249-
return values.tolist()
1256+
return values
12501257

12511258
def should_store(self, value):
12521259
# when inserting a column should not coerce integers to floats
@@ -1366,7 +1373,7 @@ def _try_coerce_result(self, result):
13661373
def should_store(self, value):
13671374
return issubclass(value.dtype.type, np.timedelta64)
13681375

1369-
def to_native_types(self, slicer=None, na_rep=None, **kwargs):
1376+
def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs):
13701377
""" convert to our native types format, slicing if desired """
13711378

13721379
values = self.values
@@ -1387,7 +1394,7 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs):
13871394
rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all')
13881395
for val in values.ravel()[imask]],
13891396
dtype=object)
1390-
return rvalues.tolist()
1397+
return rvalues
13911398

13921399

13931400
def get_values(self, dtype=None):
@@ -1763,18 +1770,19 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
17631770
ndim=self.ndim,
17641771
placement=self.mgr_locs)
17651772

1766-
def to_native_types(self, slicer=None, na_rep='', **kwargs):
1773+
def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
17671774
""" convert to our native types format, slicing if desired """
17681775

17691776
values = self.values
17701777
if slicer is not None:
17711778
# Categorical is always one dimension
17721779
values = values[slicer]
1773-
values = np.array(values, dtype=object)
17741780
mask = isnull(values)
1781+
values = np.array(values, dtype='object')
17751782
values[mask] = na_rep
1776-
# Blocks.to_native_type returns list of lists, but we are always only a list
1777-
return [values.tolist()]
1783+
1784+
# we are expected to return a 2-d ndarray
1785+
return values.reshape(1,len(values))
17781786

17791787
class DatetimeBlock(Block):
17801788
__slots__ = ()
@@ -1864,18 +1872,21 @@ def fillna(self, value, limit=None,
18641872
fastpath=True, placement=self.mgr_locs)]
18651873

18661874
def to_native_types(self, slicer=None, na_rep=None, date_format=None,
1867-
**kwargs):
1875+
quoting=None, **kwargs):
18681876
""" convert to our native types format, slicing if desired """
18691877

18701878
values = self.values
18711879
if slicer is not None:
18721880
values = values[:, slicer]
18731881

1882+
from pandas.core.format import _get_format_datetime64_from_values
1883+
format = _get_format_datetime64_from_values(values, date_format)
1884+
18741885
result = tslib.format_array_from_datetime(values.view('i8').ravel(),
18751886
tz=None,
1876-
format=date_format,
1887+
format=format,
18771888
na_rep=na_rep).reshape(values.shape)
1878-
return result.tolist()
1889+
return result
18791890

18801891
def should_store(self, value):
18811892
return issubclass(value.dtype.type, np.datetime64)

pandas/lib.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -933,7 +933,7 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re
933933

934934
@cython.boundscheck(False)
935935
@cython.wraparound(False)
936-
def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer):
936+
def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer):
937937

938938
cdef int N, j, i, ncols
939939
cdef list rows

pandas/tseries/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def groupby(self, f):
6161
return _algos.groupby_object(objs, f)
6262

6363
def _format_with_header(self, header, **kwargs):
64-
return header + self._format_native_types(**kwargs)
64+
return header + list(self._format_native_types(**kwargs))
6565

6666
def __contains__(self, key):
6767
try:

pandas/tseries/index.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -673,11 +673,13 @@ def _add_delta(self, delta):
673673

674674
def _format_native_types(self, na_rep=u('NaT'),
675675
date_format=None, **kwargs):
676-
from pandas.core.format import Datetime64Formatter
677-
return Datetime64Formatter(values=self,
678-
nat_rep=na_rep,
679-
date_format=date_format,
680-
justify='all').get_result()
676+
from pandas.core.format import _get_format_datetime64_from_values
677+
format = _get_format_datetime64_from_values(self, date_format)
678+
679+
return tslib.format_array_from_datetime(self.asi8,
680+
tz=self.tz,
681+
format=format,
682+
na_rep=na_rep)
681683

682684
def to_datetime(self, dayfirst=False):
683685
return self.copy()

pandas/tseries/period.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,7 @@ def to_datetime(self, dayfirst=False):
387387
qyear = _field_accessor('qyear', 1)
388388
days_in_month = _field_accessor('days_in_month', 11, "The number of days in the month")
389389
daysinmonth = days_in_month
390-
390+
391391
def _get_object_array(self):
392392
freq = self.freq
393393
return np.array([ Period._from_ordinal(ordinal=x, freq=freq) for x in self.values], copy=False)
@@ -687,7 +687,7 @@ def _format_native_types(self, na_rep=u('NaT'), **kwargs):
687687

688688
imask = ~mask
689689
values[imask] = np.array([u('%s') % dt for dt in values[imask]])
690-
return values.tolist()
690+
return values
691691

692692
def __array_finalize__(self, obj):
693693
if not self.ndim: # pragma: no cover

pandas/tslib.pyx

+10-1
Original file line numberDiff line numberDiff line change
@@ -1448,7 +1448,16 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, object f
14481448

14491449
else:
14501450
ts = Timestamp(val, tz=tz)
1451-
result[i] = ts.strftime(format)
1451+
if format is None:
1452+
result[i] = str(ts)
1453+
else:
1454+
1455+
# strftime raises ValueError for an invalid format string
1456+
# strftime requires year >= 1900; on ValueError fall back to str(ts)
1457+
try:
1458+
result[i] = ts.strftime(format)
1459+
except ValueError:
1460+
result[i] = str(ts)
14521461

14531462
return result
14541463

0 commit comments

Comments
 (0)