diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 6beb21883b5ab..fbb96380a5813 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -4,7 +4,6 @@
 import numpy as np
 import pandas.util.testing as tm
 from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime
-from pandas.io.parsers import _parser_defaults
 from io import StringIO

 from ..pandas_vb_common import BaseIO
@@ -272,13 +271,12 @@ def setup(self, do_cache):
         self.StringIO_input = StringIO(data)

     def time_read_csv_cached(self, do_cache):
-        # kwds setting here is used to avoid breaking tests in
-        # previous version of pandas, because this is api changes
-        kwds = {}
-        if 'cache_dates' in _parser_defaults:
-            kwds['cache_dates'] = do_cache
-        read_csv(self.data(self.StringIO_input), header=None,
-                 parse_dates=[0], **kwds)
+        try:
+            read_csv(self.data(self.StringIO_input), header=None,
+                     parse_dates=[0], cache_dates=do_cache)
+        except TypeError:
+            # cache_dates is a new keyword in 0.25
+            pass


 class ReadCSVMemoryGrowth(BaseIO):
@@ -329,9 +327,14 @@ def setup(self, cache_dates):
         self.StringIO_input = StringIO(data)

     def time_read_csv_dayfirst(self, cache_dates):
-        read_csv(self.data(self.StringIO_input), sep=',', header=None,
-                 names=['Date'], parse_dates=['Date'], cache_dates=cache_dates,
-                 dayfirst=True)
+        try:
+            read_csv(self.data(self.StringIO_input), sep=',', header=None,
+                     names=['Date'], parse_dates=['Date'],
+                     cache_dates=cache_dates,
+                     dayfirst=True)
+        except TypeError:
+            # cache_dates is a new keyword in 0.25
+            pass

     def time_to_datetime_dayfirst(self, cache_dates):
         df = read_csv(self.data(self.StringIO_input),
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
index 7de1c42246ad5..14ee8747cf81d 100644
--- a/asv_bench/benchmarks/timeseries.py
+++ b/asv_bench/benchmarks/timeseries.py
@@ -300,6 +300,19 @@ def time_format_YYYYMMDD(self):
         to_datetime(self.stringsD, format='%Y%m%d')


+class ToDatetimeCacheSmallCount(object):
+
+    params = ([True, False], [50, 500, 5000, 100000])
+    param_names = ['cache', 'count']
+
+    def setup(self, cache, count):
+        rng = date_range(start='1/1/1971', periods=count)
+        self.unique_date_strings = rng.strftime('%Y-%m-%d').tolist()
+
+    def time_unique_date_strings(self, cache, count):
+        to_datetime(self.unique_date_strings, cache=cache)
+
+
 class ToDatetimeISO8601:

     def setup(self):
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 77426e950798c..dd9b83747aa93 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -936,6 +936,7 @@ Performance improvements
 - Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`)
 - Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
 - Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
+- For :meth:`to_datetime` changed default value of cache parameter to ``True`` (:issue:`26043`)

 .. _whatsnew_0250.bug_fixes:

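As a rough, standalone illustration of the behaviour behind the whatsnew entry (not part of the patch; the sample strings and repetition count are made up), heavily duplicated date strings are exactly the case the now-default cache helps:

```python
import timeit

import pandas as pd

# Heavily duplicated input: only two unique strings among 100,000 values,
# so the cache converts each unique string once and reuses the result.
strings = ['2019-06-25', '2019-06-26'] * 50000

for cache in (True, False):
    elapsed = timeit.timeit(lambda: pd.to_datetime(strings, cache=cache),
                            number=3)
    print('cache=%s: %.3fs' % (cache, elapsed))
```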
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index d543ae91ad344..3e3318ed4c4b6 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -22,6 +22,14 @@
 from pandas._typing import ArrayLike
 from pandas.core import algorithms
+from pandas.core.algorithms import unique
+
+# ---------------------------------------------------------------------
+# types used in annotations
+
+ArrayConvertible = Union[list, tuple, ArrayLike, ABCSeries]
+
+# ---------------------------------------------------------------------

 # ---------------------------------------------------------------------
 # types used in annotations

@@ -42,13 +50,67 @@ def _guess_datetime_format_for_array(arr, **kwargs):
         return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)


+def should_cache(arg: ArrayConvertible, unique_share: float = 0.7,
+                 check_count: Optional[int] = None) -> bool:
+    """
+    Decides whether to do caching.
+
+    If the percent of unique elements among `check_count` elements is less
+    than `unique_share * 100`, then we can do caching.
+
+    Parameters
+    ----------
+    arg: listlike, tuple, 1-d array, Series
+    unique_share: float, default=0.7, optional
+        0 < unique_share < 1
+    check_count: int, optional
+        0 <= check_count <= len(arg)
+
+    Returns
+    -------
+    do_caching: bool
+
+    Notes
+    -----
+    By default, for a sequence of fewer than 50 items, we don't do caching;
+    for sequences of up to 5000 items, we take ten percent of all elements
+    to check for a uniqueness share; if the sequence is longer than 5000
+    items, we check only the first 500 elements.
+    All constants were chosen empirically.
+    """
+    do_caching = True
+
+    # default realization
+    if check_count is None:
+        # in this case, the gain from caching is negligible
+        if len(arg) <= 50:
+            return False
+
+        if len(arg) <= 5000:
+            check_count = int(len(arg) * 0.1)
+        else:
+            check_count = 500
+    else:
+        assert 0 <= check_count <= len(arg), \
+            'check_count must be in next bounds: [0; len(arg)]'
+        if check_count == 0:
+            return False
+
+    assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)'
+
+    unique_elements = unique(arg[:check_count])
+    if len(unique_elements) > check_count * unique_share:
+        do_caching = False
+    return do_caching
+
+
 def _maybe_cache(arg, format, cache, convert_listlike):
     """
     Create a cache of unique dates from an array of dates

     Parameters
     ----------
-    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
+    arg : listlike, tuple, 1-d array, Series
     format : string
         Strftime format to parse time
     cache : boolean
@@ -65,11 +127,12 @@ def _maybe_cache(arg, format, cache, convert_listlike):
     cache_array = Series()
     if cache:
         # Perform a quicker unique check
-        from pandas import Index
-        unique_dates = Index(arg).unique()
+        if not should_cache(arg):
+            return cache_array
+
+        unique_dates = unique(arg)
         if len(unique_dates) < len(arg):
-            cache_dates = convert_listlike(unique_dates.to_numpy(),
-                                           True, format)
+            cache_dates = convert_listlike(unique_dates, True, format)
             cache_array = Series(cache_dates, index=unique_dates)
     return cache_array

@@ -448,7 +511,7 @@ def _adjust_to_origin(arg, origin, unit):

 def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
                 utc=None, box=True, format=None, exact=True,
                 unit=None, infer_datetime_format=False, origin='unix',
-                cache=False):
+                cache=True):
     """
     Convert argument to datetime.
@@ -529,13 +592,16 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
         origin.

         .. versionadded:: 0.20.0
-    cache : boolean, default False
+    cache : boolean, default True
         If True, use a cache of unique, converted dates to apply the datetime
         conversion. May produce significant speed-up when parsing duplicate
         date strings, especially ones with timezone offsets.

         .. versionadded:: 0.23.0

+        .. versionchanged:: 0.25.0
+            - changed default value from False to True
+
     Returns
     -------
     ret : datetime if parsing succeeded.
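For reference, a minimal sketch of how the new `should_cache` heuristic behaves at its documented thresholds, assuming this patch is applied (the sample inputs are illustrative only):

```python
from pandas.core.tools import datetimes as tools

# Sequences of 50 items or fewer never use the cache.
print(tools.should_cache(list(range(50))))                  # False

# With an explicit check_count, the uniqueness share decides.
mostly_dupes = ['2019-01-01'] * 90 + ['2019-01-02'] * 10
print(tools.should_cache(mostly_dupes, unique_share=0.7,
                         check_count=len(mostly_dupes)))    # True: 2 unique values

all_unique = ['item-%03d' % i for i in range(100)]
print(tools.should_cache(all_unique, unique_share=0.7,
                         check_count=len(all_unique)))      # False: 100 unique values
```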
diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
index f401a7f7c9e9b..784633b2512ce 100644
--- a/pandas/tests/indexes/datetimes/test_tools.py
+++ b/pandas/tests/indexes/datetimes/test_tools.py
@@ -2032,3 +2032,23 @@ def test_arg_tz_ns_unit(self, offset, utc, exp):
         result = to_datetime([arg], unit='ns', utc=utc)
         expected = to_datetime([exp])
         tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize('listlike,do_caching', [
+    ([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False),
+    ([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True)
+])
+def test_should_cache(listlike, do_caching):
+    assert tools.should_cache(listlike, check_count=len(listlike),
+                              unique_share=0.7) == do_caching
+
+
+@pytest.mark.parametrize('unique_share,check_count, err_message', [
+    (0.5, 11, r'check_count must be in next bounds: \[0; len\(arg\)\]'),
+    (10, 2, r'unique_share must be in next bounds: \(0; 1\)')
+])
+def test_should_cache_errors(unique_share, check_count, err_message):
+    arg = [5] * 10
+
+    with pytest.raises(AssertionError, match=err_message):
+        tools.should_cache(arg, unique_share, check_count)
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index b0c3944e0aff8..25589a1682f7a 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -635,6 +635,21 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
         parser.read_csv(StringIO(data), parse_dates=(1,))


+@pytest.mark.parametrize("cache_dates", [True, False])
+@pytest.mark.parametrize("value", [
+    'nan', '0', ''])
+def test_bad_date_parse(all_parsers, cache_dates, value):
+    # if we have an invalid date make sure that we handle this with
+    # and w/o the cache properly
+    parser = all_parsers
+    s = StringIO(('%s,\n' % value) * 50000)
+
+    parser.read_csv(s,
+                    header=None, names=['foo', 'bar'], parse_dates=['foo'],
+                    infer_datetime_format=False,
+                    cache_dates=cache_dates)
+
+
 def test_parse_dates_empty_string(all_parsers):
     # see gh-2263
     parser = all_parsers
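A standalone counterpart of the new parser test, runnable outside the test suite (assumes a pandas version where `read_csv` accepts `cache_dates`, i.e. 0.25 per the benchmark comment above; the column names mirror the test and the row count is reduced for brevity):

```python
from io import StringIO

import pandas as pd

# Invalid date strings must parse without raising, with and without the cache.
for cache_dates in (True, False):
    df = pd.read_csv(StringIO('nan,\n' * 1000), header=None,
                     names=['foo', 'bar'], parse_dates=['foo'],
                     infer_datetime_format=False,
                     cache_dates=cache_dates)
    print(cache_dates, df.dtypes.to_dict())
```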