Skip to content

Commit ce567de

Browse files
anmyachev authored and jreback committed
PERF: changed default value of cache parameter to True in to_datetime function (#26043)
1 parent c7d7e81 commit ce567de

File tree

6 files changed

+136
-18
lines changed

6 files changed

+136
-18
lines changed

asv_bench/benchmarks/io/csv.py

+14-11
Original file line number · Diff line number · Diff line change
@@ -4,7 +4,6 @@
44
import numpy as np
55
import pandas.util.testing as tm
66
from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime
7-
from pandas.io.parsers import _parser_defaults
87
from io import StringIO
98

109
from ..pandas_vb_common import BaseIO
@@ -272,13 +271,12 @@ def setup(self, do_cache):
272271
self.StringIO_input = StringIO(data)
273272

274273
def time_read_csv_cached(self, do_cache):
275-
# kwds setting here is used to avoid breaking tests in
276-
# previous version of pandas, because this is api changes
277-
kwds = {}
278-
if 'cache_dates' in _parser_defaults:
279-
kwds['cache_dates'] = do_cache
280-
read_csv(self.data(self.StringIO_input), header=None,
281-
parse_dates=[0], **kwds)
274+
try:
275+
read_csv(self.data(self.StringIO_input), header=None,
276+
parse_dates=[0], cache_dates=do_cache)
277+
except TypeError:
278+
# cache_dates is a new keyword in 0.25
279+
pass
282280

283281

284282
class ReadCSVMemoryGrowth(BaseIO):
@@ -329,9 +327,14 @@ def setup(self, cache_dates):
329327
self.StringIO_input = StringIO(data)
330328

331329
def time_read_csv_dayfirst(self, cache_dates):
332-
read_csv(self.data(self.StringIO_input), sep=',', header=None,
333-
names=['Date'], parse_dates=['Date'], cache_dates=cache_dates,
334-
dayfirst=True)
330+
try:
331+
read_csv(self.data(self.StringIO_input), sep=',', header=None,
332+
names=['Date'], parse_dates=['Date'],
333+
cache_dates=cache_dates,
334+
dayfirst=True)
335+
except TypeError:
336+
# cache_dates is a new keyword in 0.25
337+
pass
335338

336339
def time_to_datetime_dayfirst(self, cache_dates):
337340
df = read_csv(self.data(self.StringIO_input),

asv_bench/benchmarks/timeseries.py

+13
Original file line number · Diff line number · Diff line change
@@ -300,6 +300,19 @@ def time_format_YYYYMMDD(self):
300300
to_datetime(self.stringsD, format='%Y%m%d')
301301

302302

303+
class ToDatetimeCacheSmallCount(object):
304+
305+
params = ([True, False], [50, 500, 5000, 100000])
306+
param_names = ['cache', 'count']
307+
308+
def setup(self, cache, count):
309+
rng = date_range(start='1/1/1971', periods=count)
310+
self.unique_date_strings = rng.strftime('%Y-%m-%d').tolist()
311+
312+
def time_unique_date_strings(self, cache, count):
313+
to_datetime(self.unique_date_strings, cache=cache)
314+
315+
303316
class ToDatetimeISO8601:
304317

305318
def setup(self):

doc/source/whatsnew/v0.25.0.rst

+1
Original file line number · Diff line number · Diff line change
@@ -937,6 +937,7 @@ Performance improvements
937937
- Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`)
938938
- Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
939939
- Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
940+
- For :meth:`to_datetime` changed default value of cache parameter to ``True`` (:issue:`26043`)
940941
941942
.. _whatsnew_0250.bug_fixes:
942943

pandas/core/tools/datetimes.py

+73-7
Original file line number · Diff line number · Diff line change
@@ -22,6 +22,14 @@
2222

2323
from pandas._typing import ArrayLike
2424
from pandas.core import algorithms
25+
from pandas.core.algorithms import unique
26+
27+
# ---------------------------------------------------------------------
28+
# types used in annotations
29+
30+
ArrayConvertible = Union[list, tuple, ArrayLike, ABCSeries]
31+
32+
# ---------------------------------------------------------------------
2533

2634
# ---------------------------------------------------------------------
2735
# types used in annotations
@@ -42,13 +50,67 @@ def _guess_datetime_format_for_array(arr, **kwargs):
4250
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
4351

4452

53+
def should_cache(arg: ArrayConvertible, unique_share: float = 0.7,
54+
check_count: Optional[int] = None) -> bool:
55+
"""
56+
Decides whether to do caching.
57+
58+
If the percent of unique elements among `check_count` elements is less
59+
than `unique_share * 100`, then we can do caching.
60+
61+
Parameters
62+
----------
63+
arg: listlike, tuple, 1-d array, Series
64+
unique_share: float, default=0.7, optional
65+
0 < unique_share < 1
66+
check_count: int, optional
67+
0 <= check_count <= len(arg)
68+
69+
Returns
70+
-------
71+
do_caching: bool
72+
73+
Notes
74+
-----
75+
By default for a sequence of less than 50 items in size, we don't do
76+
caching; for the number of elements less than 5000, we take ten percent of
77+
all elements to check for a uniqueness share; if the sequence size is more
78+
than 5000, then we check only the first 500 elements.
79+
All constants were chosen empirically by.
80+
"""
81+
do_caching = True
82+
83+
# default realization
84+
if check_count is None:
85+
# in this case, the gain from caching is negligible
86+
if len(arg) <= 50:
87+
return False
88+
89+
if len(arg) <= 5000:
90+
check_count = int(len(arg) * 0.1)
91+
else:
92+
check_count = 500
93+
else:
94+
assert 0 <= check_count <= len(arg), \
95+
'check_count must be in next bounds: [0; len(arg)]'
96+
if check_count == 0:
97+
return False
98+
99+
assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)'
100+
101+
unique_elements = unique(arg[:check_count])
102+
if len(unique_elements) > check_count * unique_share:
103+
do_caching = False
104+
return do_caching
105+
106+
45107
def _maybe_cache(arg, format, cache, convert_listlike):
46108
"""
47109
Create a cache of unique dates from an array of dates
48110
49111
Parameters
50112
----------
51-
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
113+
arg : listlike, tuple, 1-d array, Series
52114
format : string
53115
Strftime format to parse time
54116
cache : boolean
@@ -65,11 +127,12 @@ def _maybe_cache(arg, format, cache, convert_listlike):
65127
cache_array = Series()
66128
if cache:
67129
# Perform a quicker unique check
68-
from pandas import Index
69-
unique_dates = Index(arg).unique()
130+
if not should_cache(arg):
131+
return cache_array
132+
133+
unique_dates = unique(arg)
70134
if len(unique_dates) < len(arg):
71-
cache_dates = convert_listlike(unique_dates.to_numpy(),
72-
True, format)
135+
cache_dates = convert_listlike(unique_dates, True, format)
73136
cache_array = Series(cache_dates, index=unique_dates)
74137
return cache_array
75138

@@ -448,7 +511,7 @@ def _adjust_to_origin(arg, origin, unit):
448511
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
449512
utc=None, box=True, format=None, exact=True,
450513
unit=None, infer_datetime_format=False, origin='unix',
451-
cache=False):
514+
cache=True):
452515
"""
453516
Convert argument to datetime.
454517
@@ -529,13 +592,16 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
529592
origin.
530593
531594
.. versionadded:: 0.20.0
532-
cache : boolean, default False
595+
cache : boolean, default True
533596
If True, use a cache of unique, converted dates to apply the datetime
534597
conversion. May produce significant speed-up when parsing duplicate
535598
date strings, especially ones with timezone offsets.
536599
537600
.. versionadded:: 0.23.0
538601
602+
.. versionchanged:: 0.25.0
603+
- changed default value from False to True
604+
539605
Returns
540606
-------
541607
ret : datetime if parsing succeeded.

pandas/tests/indexes/datetimes/test_tools.py

+20
Original file line number · Diff line number · Diff line change
@@ -2032,3 +2032,23 @@ def test_arg_tz_ns_unit(self, offset, utc, exp):
20322032
result = to_datetime([arg], unit='ns', utc=utc)
20332033
expected = to_datetime([exp])
20342034
tm.assert_index_equal(result, expected)
2035+
2036+
2037+
@pytest.mark.parametrize('listlike,do_caching', [
2038+
([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False),
2039+
([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True)
2040+
])
2041+
def test_should_cache(listlike, do_caching):
2042+
assert tools.should_cache(listlike, check_count=len(listlike),
2043+
unique_share=0.7) == do_caching
2044+
2045+
2046+
@pytest.mark.parametrize('unique_share,check_count, err_message', [
2047+
(0.5, 11, r'check_count must be in next bounds: \[0; len\(arg\)\]'),
2048+
(10, 2, r'unique_share must be in next bounds: \(0; 1\)')
2049+
])
2050+
def test_should_cache_errors(unique_share, check_count, err_message):
2051+
arg = [5] * 10
2052+
2053+
with pytest.raises(AssertionError, match=err_message):
2054+
tools.should_cache(arg, unique_share, check_count)

pandas/tests/io/parser/test_parse_dates.py

+15
Original file line number · Diff line number · Diff line change
@@ -635,6 +635,21 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
635635
parser.read_csv(StringIO(data), parse_dates=(1,))
636636

637637

638+
@pytest.mark.parametrize("cache_dates", [True, False])
639+
@pytest.mark.parametrize("value", [
640+
'nan', '0', ''])
641+
def test_bad_date_parse(all_parsers, cache_dates, value):
642+
# if we have an invalid date make sure that we handle this with
643+
# and w/o the cache properly
644+
parser = all_parsers
645+
s = StringIO(('%s,\n' % value) * 50000)
646+
647+
parser.read_csv(s,
648+
header=None, names=['foo', 'bar'], parse_dates=['foo'],
649+
infer_datetime_format=False,
650+
cache_dates=cache_dates)
651+
652+
638653
def test_parse_dates_empty_string(all_parsers):
639654
# see gh-2263
640655
parser = all_parsers

0 commit comments

Comments (0)