Skip to content

Commit ce567de

Browse files
anmyachev authored and jreback committed
PERF: changed default value of cache parameter to True in to_datetime function (#26043)
1 parent c7d7e81 commit ce567de

File tree

6 files changed

+136
-18
lines changed

6 files changed

+136
-18
lines changed

asv_bench/benchmarks/io/csv.py

+14-11
Original file line number · Diff line number · Diff line change
@@ -4,7 +4,6 @@
44
import numpy as np
55
import pandas.util.testing as tm
66
from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime
7-
from pandas.io.parsers import _parser_defaults
87
from io import StringIO
98

109
from ..pandas_vb_common import BaseIO
@@ -272,13 +271,12 @@ def setup(self, do_cache):
272271
self.StringIO_input = StringIO(data)
273272

274273
def time_read_csv_cached(self, do_cache):
275-
# kwds setting here is used to avoid breaking tests in
276-
# previous version of pandas, because this is api changes
277-
kwds = {}
278-
if 'cache_dates' in _parser_defaults:
279-
kwds['cache_dates'] = do_cache
280-
read_csv(self.data(self.StringIO_input), header=None,
281-
parse_dates=[0], **kwds)
274+
try:
275+
read_csv(self.data(self.StringIO_input), header=None,
276+
parse_dates=[0], cache_dates=do_cache)
277+
except TypeError:
278+
# cache_dates is a new keyword in 0.25
279+
pass
282280

283281

284282
class ReadCSVMemoryGrowth(BaseIO):
@@ -329,9 +327,14 @@ def setup(self, cache_dates):
329327
self.StringIO_input = StringIO(data)
330328

331329
def time_read_csv_dayfirst(self, cache_dates):
332-
read_csv(self.data(self.StringIO_input), sep=',', header=None,
333-
names=['Date'], parse_dates=['Date'], cache_dates=cache_dates,
334-
dayfirst=True)
330+
try:
331+
read_csv(self.data(self.StringIO_input), sep=',', header=None,
332+
names=['Date'], parse_dates=['Date'],
333+
cache_dates=cache_dates,
334+
dayfirst=True)
335+
except TypeError:
336+
# cache_dates is a new keyword in 0.25
337+
pass
335338

336339
def time_to_datetime_dayfirst(self, cache_dates):
337340
df = read_csv(self.data(self.StringIO_input),

asv_bench/benchmarks/timeseries.py

+13
Original file line number · Diff line number · Diff line change
@@ -300,6 +300,19 @@ def time_format_YYYYMMDD(self):
300300
to_datetime(self.stringsD, format='%Y%m%d')
301301

302302

303+
class ToDatetimeCacheSmallCount(object):
304+
305+
params = ([True, False], [50, 500, 5000, 100000])
306+
param_names = ['cache', 'count']
307+
308+
def setup(self, cache, count):
309+
rng = date_range(start='1/1/1971', periods=count)
310+
self.unique_date_strings = rng.strftime('%Y-%m-%d').tolist()
311+
312+
def time_unique_date_strings(self, cache, count):
313+
to_datetime(self.unique_date_strings, cache=cache)
314+
315+
303316
class ToDatetimeISO8601:
304317

305318
def setup(self):

doc/source/whatsnew/v0.25.0.rst

+1
Original file line number · Diff line number · Diff line change
@@ -937,6 +937,7 @@ Performance improvements
937937
- Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`)
938938
- Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
939939
- Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
940+
- For :meth:`to_datetime` changed default value of cache parameter to ``True`` (:issue:`26043`)
940941
941942
.. _whatsnew_0250.bug_fixes:
942943

pandas/core/tools/datetimes.py

+73-7
Original file line number · Diff line number · Diff line change
@@ -22,6 +22,14 @@
2222

2323
from pandas._typing import ArrayLike
2424
from pandas.core import algorithms
25+
from pandas.core.algorithms import unique
26+
27+
# ---------------------------------------------------------------------
28+
# types used in annotations
29+
30+
ArrayConvertible = Union[list, tuple, ArrayLike, ABCSeries]
31+
32+
# ---------------------------------------------------------------------
2533

2634
# ---------------------------------------------------------------------
2735
# types used in annotations
@@ -42,13 +50,67 @@ def _guess_datetime_format_for_array(arr, **kwargs):
4250
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
4351

4452

53+
def should_cache(arg: ArrayConvertible, unique_share: float = 0.7,
54+
check_count: Optional[int] = None) -> bool:
55+
"""
56+
Decides whether to do caching.
57+
58+
If the percent of unique elements among `check_count` elements is less
59+
than `unique_share * 100`, then we can do caching.
60+
61+
Parameters
62+
----------
63+
arg: listlike, tuple, 1-d array, Series
64+
unique_share: float, default=0.7, optional
65+
0 < unique_share < 1
66+
check_count: int, optional
67+
0 <= check_count <= len(arg)
68+
69+
Returns
70+
-------
71+
do_caching: bool
72+
73+
Notes
74+
-----
75+
By default for a sequence of less than 50 items in size, we don't do
76+
caching; for the number of elements less than 5000, we take ten percent of
77+
all elements to check for a uniqueness share; if the sequence size is more
78+
than 5000, then we check only the first 500 elements.
79+
All constants were chosen empirically by.
80+
"""
81+
do_caching = True
82+
83+
# default realization
84+
if check_count is None:
85+
# in this case, the gain from caching is negligible
86+
if len(arg) <= 50:
87+
return False
88+
89+
if len(arg) <= 5000:
90+
check_count = int(len(arg) * 0.1)
91+
else:
92+
check_count = 500
93+
else:
94+
assert 0 <= check_count <= len(arg), \
95+
'check_count must be in next bounds: [0; len(arg)]'
96+
if check_count == 0:
97+
return False
98+
99+
assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)'
100+
101+
unique_elements = unique(arg[:check_count])
102+
if len(unique_elements) > check_count * unique_share:
103+
do_caching = False
104+
return do_caching
105+
106+
45107
def _maybe_cache(arg, format, cache, convert_listlike):
46108
"""
47109
Create a cache of unique dates from an array of dates
48110
49111
Parameters
50112
----------
51-
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
113+
arg : listlike, tuple, 1-d array, Series
52114
format : string
53115
Strftime format to parse time
54116
cache : boolean
@@ -65,11 +127,12 @@ def _maybe_cache(arg, format, cache, convert_listlike):
65127
cache_array = Series()
66128
if cache:
67129
# Perform a quicker unique check
68-
from pandas import Index
69-
unique_dates = Index(arg).unique()
130+
if not should_cache(arg):
131+
return cache_array
132+
133+
unique_dates = unique(arg)
70134
if len(unique_dates) < len(arg):
71-
cache_dates = convert_listlike(unique_dates.to_numpy(),
72-
True, format)
135+
cache_dates = convert_listlike(unique_dates, True, format)
73136
cache_array = Series(cache_dates, index=unique_dates)
74137
return cache_array
75138

@@ -448,7 +511,7 @@ def _adjust_to_origin(arg, origin, unit):
448511
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
449512
utc=None, box=True, format=None, exact=True,
450513
unit=None, infer_datetime_format=False, origin='unix',
451-
cache=False):
514+
cache=True):
452515
"""
453516
Convert argument to datetime.
454517
@@ -529,13 +592,16 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
529592
origin.
530593
531594
.. versionadded:: 0.20.0
532-
cache : boolean, default False
595+
cache : boolean, default True
533596
If True, use a cache of unique, converted dates to apply the datetime
534597
conversion. May produce significant speed-up when parsing duplicate
535598
date strings, especially ones with timezone offsets.
536599
537600
.. versionadded:: 0.23.0
538601
602+
.. versionchanged:: 0.25.0
603+
- changed default value from False to True
604+
539605
Returns
540606
-------
541607
ret : datetime if parsing succeeded.

pandas/tests/indexes/datetimes/test_tools.py

+20
Original file line number · Diff line number · Diff line change
@@ -2032,3 +2032,23 @@ def test_arg_tz_ns_unit(self, offset, utc, exp):
20322032
result = to_datetime([arg], unit='ns', utc=utc)
20332033
expected = to_datetime([exp])
20342034
tm.assert_index_equal(result, expected)
2035+
2036+
2037+
@pytest.mark.parametrize('listlike,do_caching', [
2038+
([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False),
2039+
([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True)
2040+
])
2041+
def test_should_cache(listlike, do_caching):
2042+
assert tools.should_cache(listlike, check_count=len(listlike),
2043+
unique_share=0.7) == do_caching
2044+
2045+
2046+
@pytest.mark.parametrize('unique_share,check_count, err_message', [
2047+
(0.5, 11, r'check_count must be in next bounds: \[0; len\(arg\)\]'),
2048+
(10, 2, r'unique_share must be in next bounds: \(0; 1\)')
2049+
])
2050+
def test_should_cache_errors(unique_share, check_count, err_message):
2051+
arg = [5] * 10
2052+
2053+
with pytest.raises(AssertionError, match=err_message):
2054+
tools.should_cache(arg, unique_share, check_count)

pandas/tests/io/parser/test_parse_dates.py

+15
Original file line number · Diff line number · Diff line change
@@ -635,6 +635,21 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
635635
parser.read_csv(StringIO(data), parse_dates=(1,))
636636

637637

638+
@pytest.mark.parametrize("cache_dates", [True, False])
639+
@pytest.mark.parametrize("value", [
640+
'nan', '0', ''])
641+
def test_bad_date_parse(all_parsers, cache_dates, value):
642+
# if we have an invalid date make sure that we handle this with
643+
# and w/o the cache properly
644+
parser = all_parsers
645+
s = StringIO(('%s,\n' % value) * 50000)
646+
647+
parser.read_csv(s,
648+
header=None, names=['foo', 'bar'], parse_dates=['foo'],
649+
infer_datetime_format=False,
650+
cache_dates=cache_dates)
651+
652+
638653
def test_parse_dates_empty_string(all_parsers):
639654
# see gh-2263
640655
parser = all_parsers

0 commit comments

Comments (0)