Skip to content

Commit 07b39c8

Browse files
committed
PERF: improve perf of writing csv's with datetimes
1 parent 878d860 commit 07b39c8

File tree

6 files changed

+105
-55
lines changed

6 files changed

+105
-55
lines changed

doc/source/whatsnew/v0.16.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ API changes
8989
Performance Improvements
9090
~~~~~~~~~~~~~~~~~~~~~~~~
9191

92-
92+
- Improved csv write performance with mixed dtypes, including datetimes (:issue:`9940`)
9393

9494

9595

pandas/core/format.py

+42-34
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,14 @@
1414
from pandas.core.config import get_option, set_option
1515
import pandas.core.common as com
1616
import pandas.lib as lib
17-
from pandas.tslib import iNaT, Timestamp, Timedelta
18-
17+
from pandas.tslib import iNaT, Timestamp, Timedelta, format_array_from_datetime
18+
from pandas.tseries.index import DatetimeIndex
19+
from pandas.tseries.period import PeriodIndex
1920
import numpy as np
2021

2122
import itertools
2223
import csv
2324

24-
from pandas.tseries.period import PeriodIndex, DatetimeIndex
25-
2625
docstring_to_string = """
2726
Parameters
2827
----------
@@ -2030,16 +2029,50 @@ def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs):
20302029
self.date_format = date_format
20312030

20322031
def _format_strings(self):
    """
    Render ``self.values`` as a list of strings.

    Timezone-naive data without a user-supplied ``self.formatter`` is
    formatted in one vectorised call to ``format_array_from_datetime``.
    Everything else (tz-aware data, or a custom formatter) is boxed to
    objects and formatted element-by-element.

    Returns
    -------
    list of formatted values, one per element of ``self.values``
    """
    values = self.values
    if not isinstance(values, DatetimeIndex):
        values = DatetimeIndex(values)

    # Fast path: only when there is no tz and no custom formatter that
    # must take precedence over the built-in formatting.
    if values.tz is None and self.formatter is None:
        if _is_dates_only(values):
            fmt = self.date_format or "%Y-%m-%d"
        else:
            # honour an explicit date_format even when a time component is
            # present; None selects the default full-timestamp layout
            fmt = self.date_format

        fmt_values = format_array_from_datetime(
            values.asi8.ravel(),
            format=fmt,
            na_rep=self.nat_rep).reshape(values.shape)
        return fmt_values.tolist()

    # Slow path: we may have a tz, if so we need to process
    # element-by-element; when DatetimeBlockWithTimezones is a reality
    # this could be fixed.
    values = values.asobject
    formatter = self.formatter
    if formatter is None:
        # nat_rep/date_format are passed by keyword: the second positional
        # parameter of _get_format_datetime64 is nat_rep, not the values
        formatter = _get_format_datetime64(_is_dates_only(values),
                                           nat_rep=self.nat_rep,
                                           date_format=self.date_format)
    return [formatter(x) for x in values]
20412060

20422061

2062+
def _is_dates_only(values):
2063+
# return a boolean if we are only dates (and don't have a timezone)
2064+
values = DatetimeIndex(values)
2065+
if values.tz is not None:
2066+
return False
2067+
2068+
values_int = values.asi8
2069+
consider_values = values_int != iNaT
2070+
one_day_nanos = (86400 * 1e9)
2071+
even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0
2072+
if even_days:
2073+
return True
2074+
return False
2075+
20432076
def _format_datetime64(x, tz=None, nat_rep='NaT'):
20442077
if x is None or lib.checknull(x):
20452078
return nat_rep
@@ -2062,22 +2095,6 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None):
20622095
else:
20632096
return x._date_repr
20642097

2065-
2066-
def _is_dates_only(values):
2067-
# return a boolean if we are only dates (and don't have a timezone)
2068-
from pandas import DatetimeIndex
2069-
values = DatetimeIndex(values)
2070-
if values.tz is not None:
2071-
return False
2072-
2073-
values_int = values.asi8
2074-
consider_values = values_int != iNaT
2075-
one_day_nanos = (86400 * 1e9)
2076-
even_days = np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0
2077-
if even_days:
2078-
return True
2079-
return False
2080-
20812098
def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None):
20822099

20832100
if is_dates_only:
@@ -2088,15 +2105,6 @@ def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None):
20882105
return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep)
20892106

20902107

2091-
def _get_format_datetime64_from_values(values,
2092-
nat_rep='NaT',
2093-
date_format=None):
2094-
is_dates_only = _is_dates_only(values)
2095-
return _get_format_datetime64(is_dates_only=is_dates_only,
2096-
nat_rep=nat_rep,
2097-
date_format=date_format)
2098-
2099-
21002108
class Timedelta64Formatter(GenericArrayFormatter):
21012109

21022110
def __init__(self, values, nat_rep='NaT', box=False, **kwargs):

pandas/core/internals.py

+5-16
Original file line numberDiff line numberDiff line change
@@ -1870,23 +1870,12 @@ def to_native_types(self, slicer=None, na_rep=None, date_format=None,
18701870
values = self.values
18711871
if slicer is not None:
18721872
values = values[:, slicer]
1873-
mask = isnull(values)
18741873

1875-
rvalues = np.empty(values.shape, dtype=object)
1876-
if na_rep is None:
1877-
na_rep = 'NaT'
1878-
rvalues[mask] = na_rep
1879-
imask = (~mask).ravel()
1880-
1881-
if date_format is None:
1882-
date_formatter = lambda x: Timestamp(x)._repr_base
1883-
else:
1884-
date_formatter = lambda x: Timestamp(x).strftime(date_format)
1885-
1886-
rvalues.flat[imask] = np.array([date_formatter(val) for val in
1887-
values.ravel()[imask]], dtype=object)
1888-
1889-
return rvalues.tolist()
1874+
result = tslib.format_array_from_datetime(values.view('i8').ravel(),
1875+
tz=None,
1876+
format=date_format,
1877+
na_rep=na_rep).reshape(values.shape)
1878+
return result.tolist()
18901879

18911880
def should_store(self, value):
18921881
return issubclass(value.dtype.type, np.datetime64)

pandas/tests/test_format.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -3010,12 +3010,12 @@ def test_format(self):
30103010

30113011
def test_output_significant_digits(self):
30123012
# Issue #9764
3013-
3013+
30143014
# In case default display precision changes:
30153015
with pd.option_context('display.precision', 7):
30163016
# DataFrame example from issue #9764
30173017
d=pd.DataFrame({'col1':[9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, 4.999e-6, 5e-6, 5.0001e-6, 6e-6]})
3018-
3018+
30193019
expected_output={
30203020
(0,6):' col1\n0 9.999000e-08\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07',
30213021
(1,6):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07',

pandas/tseries/index.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -673,9 +673,8 @@ def _add_delta(self, delta):
673673

674674
def _format_native_types(self, na_rep=u('NaT'),
675675
date_format=None, **kwargs):
676-
data = self.asobject
677676
from pandas.core.format import Datetime64Formatter
678-
return Datetime64Formatter(values=data,
677+
return Datetime64Formatter(values=self,
679678
nat_rep=na_rep,
680679
date_format=date_format,
681680
justify='all').get_result()

pandas/tslib.pyx

+54
Original file line numberDiff line numberDiff line change
@@ -1398,6 +1398,60 @@ def parse_datetime_string(date_string, **kwargs):
13981398
dt = parse_date(date_string, **kwargs)
13991399
return dt
14001400

1401+
def format_array_from_datetime(ndarray[int64_t] values, object tz=None, object format=None, object na_rep=None):
    """
    return a np object array of the string formatted values

    Parameters
    ----------
    values : a 1-d i8 array
    tz : the timezone (or None)
    format : optional, default is None
          a strftime capable string
    na_rep : optional, default is None
          a nat format; None is rendered as 'NaT'

    Returns
    -------
    1-d object ndarray with one formatted entry per input value
    """
    cdef:
        Py_ssize_t i
        int64_t val, ns, N = len(values)
        ndarray[object] result = np.empty(N, dtype=object)
        object ts, res
        pandas_datetimestruct dts
        # loop-invariant: no custom format and no tz means the timestamp
        # can be rendered straight from the datetimestruct
        bint basic_format = format is None and tz is None

    if na_rep is None:
        na_rep = 'NaT'

    for i in range(N):
        val = values[i]

        if val == iNaT:
            result[i] = na_rep
        elif basic_format:

            pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts)
            res = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year,
                                                   dts.month,
                                                   dts.day,
                                                   dts.hour,
                                                   dts.min,
                                                   dts.sec)

            # dts.ps is divided by 1000 and combined with dts.us * 1000
            # below, i.e. it is treated as the sub-microsecond remainder
            ns = dts.ps / 1000

            if ns != 0:
                res += '.%.9d' % (ns + 1000 * dts.us)
            elif dts.us != 0:
                res += '.%.6d' % dts.us

            result[i] = res

        else:
            ts = Timestamp(val, tz=tz)
            if format is None:
                # tz given without a format: strftime(None) would raise,
                # so fall back to the Timestamp's own string form
                result[i] = str(ts)
            else:
                result[i] = ts.strftime(format)

    return result
14011455
def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
14021456
format=None, utc=None, coerce=False, unit=None):
14031457
cdef:

0 commit comments

Comments
 (0)