From c60ad2fabaa5e466f2436ff83a5b71e58743bb49 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Wed, 10 Apr 2019 21:05:52 +0300
Subject: [PATCH 01/20] changed default value of cache parameter to True

---
 pandas/core/tools/datetimes.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index d543ae91ad344..bb7e589608404 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -448,7 +448,7 @@ def _adjust_to_origin(arg, origin, unit):
 def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
                 utc=None, box=True, format=None, exact=True,
                 unit=None, infer_datetime_format=False, origin='unix',
-                cache=False):
+                cache=True):
     """
     Convert argument to datetime.
 
@@ -529,12 +529,12 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
           origin.
 
         .. versionadded:: 0.20.0
-    cache : boolean, default False
+    cache : boolean, default True
         If True, use a cache of unique, converted dates to apply the datetime
         conversion. May produce significant speed-up when parsing duplicate
         date strings, especially ones with timezone offsets.
 
-        .. versionadded:: 0.23.0
+        .. versionchanged:: 0.25.0
 
     Returns
     -------

From 1b71f8c721f67ad3ce84ae286d3ed5a792b9ccc0 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Fri, 17 May 2019 15:31:32 +0300
Subject: [PATCH 02/20] added new entry for cache changing

---
 pandas/core/tools/datetimes.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index bb7e589608404..456359ede6566 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -534,7 +534,11 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
         conversion. May produce significant speed-up when parsing duplicate
         date strings, especially ones with timezone offsets.
 
-        .. versionchanged:: 0.25.0
+        .. versionadded:: 0.25.0
+
+        cache : boolean, default False
+
+            .. versionadded:: 0.23.0
 
     Returns
     -------

From 461776263e9d3e7c4fc2a4563f2fbc3b65a3b058 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Fri, 17 May 2019 15:42:01 +0300
Subject: [PATCH 03/20] added whatsnew note in the performance section

---
 doc/source/whatsnew/v0.25.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index ea6a04ac726b7..aa81a687d639d 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -879,6 +879,7 @@ Performance improvements
 - Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`)
 - Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
 - Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
+- Changed the default value of the ``cache`` parameter of :func:`to_datetime` to ``True`` (:issue:`26043`)
 
 .. _whatsnew_0250.bug_fixes:
 

From 4fcb04ac365a89f863157c925e429d8a7add3f3b Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 20 May 2019 14:46:48 +0300
Subject: [PATCH 04/20] removed: 'cache : boolean, default False'

---
 pandas/core/tools/datetimes.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 456359ede6566..b915083e07b31 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -536,10 +536,6 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
 
         .. versionadded:: 0.25.0
 
-        cache : boolean, default False
-
-            .. versionadded:: 0.23.0
-
     Returns
     -------
     ret : datetime if parsing succeeded.

From db0fe62739ccf65b6b060dfa861bc78a9a9748e0 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 20 May 2019 15:45:44 +0300
Subject: [PATCH 05/20] added '.. versionchanged'

---
 pandas/core/tools/datetimes.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index b915083e07b31..f6fe000a7f34c 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -534,7 +534,10 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
         conversion. May produce significant speed-up when parsing duplicate
         date strings, especially ones with timezone offsets.
 
-        .. versionadded:: 0.25.0
+        .. versionadded:: 0.23.0
+
+        .. versionchanged:: 0.25.0
+            - changed default value from False to True
 
     Returns
     -------

From 7d9f0efe28e337b928daf25dbb974ae386cb1e89 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Sun, 9 Jun 2019 23:25:35 +0300
Subject: [PATCH 06/20] added benchmark

---
 asv_bench/benchmarks/timeseries.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
index 7de1c42246ad5..14ee8747cf81d 100644
--- a/asv_bench/benchmarks/timeseries.py
+++ b/asv_bench/benchmarks/timeseries.py
@@ -300,6 +300,19 @@ def time_format_YYYYMMDD(self):
         to_datetime(self.stringsD, format='%Y%m%d')
 
 
+class ToDatetimeCacheSmallCount(object):
+
+    params = ([True, False], [50, 500, 5000, 100000])
+    param_names = ['cache', 'count']
+
+    def setup(self, cache, count):
+        rng = date_range(start='1/1/1971', periods=count)
+        self.unique_date_strings = rng.strftime('%Y-%m-%d').tolist()
+
+    def time_unique_date_strings(self, cache, count):
+        to_datetime(self.unique_date_strings, cache=cache)
+
+
 class ToDatetimeISO8601:
 
     def setup(self):

From c51f00defd410d7c1a70ac96671abcec3e113293 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Sun, 9 Jun 2019 23:28:10 +0300
Subject: [PATCH 07/20] added heuristic to decrease slowdowns for unique arrays

---
 pandas/core/tools/datetimes.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index f6fe000a7f34c..ade560c4e1b71 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -42,6 +42,31 @@ def _guess_datetime_format_for_array(arr, **kwargs):
         return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
 
 
+def do_cache(arg, check_count: int, unique_share: float):
+    """
+    Decides whether to do caching.
+
+    If the percent of unique elements among `check_count` elements is less
+    than `unique_share * 100` then we can do caching.
+
+    Parameters
+    ----------
+    arg: list, tuple, 1-d array, Series
+    check_count: int
+    unique_share: float
+
+    Returns
+    -------
+    : bool
+    """
+    from pandas.core.algorithms import unique
+
+    unique = unique(arg[:check_count])
+    if len(unique) > check_count * unique_share:
+        return False
+    return True
+
+
 def _maybe_cache(arg, format, cache, convert_listlike):
     """
     Create a cache of unique dates from an array of dates
@@ -66,6 +91,10 @@ def _maybe_cache(arg, format, cache, convert_listlike):
     if cache:
         # Perform a quicker unique check
         from pandas import Index
+
+        if not do_cache(arg, int(len(arg) * 0.1), 0.7):
+            return cache_array
+
         unique_dates = Index(arg).unique()
         if len(unique_dates) < len(arg):
             cache_dates = convert_listlike(unique_dates.to_numpy(),

From 417e005d3432c2763d688fa91ce623709dccfb3f Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 10 Jun 2019 12:45:22 +0300
Subject: [PATCH 08/20] corrected the code due to the reviewers comments

---
 pandas/core/tools/datetimes.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index ade560c4e1b71..1270a35a9a837 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -12,6 +12,7 @@
 from pandas._libs.tslibs.strptime import array_strptime
 from pandas.util._decorators import deprecate_kwarg
 
+from pandas.core.algorithms import unique
 from pandas.core.dtypes.common import (
     ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype,
     is_datetime64tz_dtype, is_float, is_integer, is_integer_dtype,
@@ -42,7 +43,7 @@ def _guess_datetime_format_for_array(arr, **kwargs):
         return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
 
 
-def do_cache(arg, check_count: int, unique_share: float):
+def should_cache(arg, check_count: int, unique_share: float):
     """
     Decides whether to do caching.
 
@@ -51,20 +52,25 @@ def do_cache(arg, check_count: int, unique_share: float):
 
     Parameters
     ----------
-    arg: list, tuple, 1-d array, Series
+    arg: listlike, tuple, 1-d array, Series
     check_count: int
+        0 < check_count <= len(arg)
     unique_share: float
+        0 < unique_share < 1
 
     Returns
     -------
-    : bool
+    do_caching: bool
     """
-    from pandas.core.algorithms import unique
+    assert 0 < check_count <= len(arg)
+    assert 0 < unique_share < 1
 
-    unique = unique(arg[:check_count])
-    if len(unique) > check_count * unique_share:
-        return False
-    return True
+    do_caching = True
+
+    unique_elements = unique(arg[:check_count])
+    if len(unique_elements) > check_count * unique_share:
+        do_caching = False
+    return do_caching
 
 
 def _maybe_cache(arg, format, cache, convert_listlike):
@@ -73,7 +79,7 @@ def _maybe_cache(arg, format, cache, convert_listlike):
 
     Parameters
     ----------
-    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
+    arg : listlike, tuple, 1-d array, Series
     format : string
         Strftime format to parse time
     cache : boolean
@@ -92,7 +98,7 @@ def _maybe_cache(arg, format, cache, convert_listlike):
         # Perform a quicker unique check
         from pandas import Index
 
-        if not do_cache(arg, int(len(arg) * 0.1), 0.7):
+        if not should_cache(arg, int(len(arg) * 0.1), 0.7):
             return cache_array
 
         unique_dates = Index(arg).unique()

From ed0725ef06be5fa4ab4e77635318dd5b99c4192f Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 10 Jun 2019 13:22:22 +0300
Subject: [PATCH 09/20] added errors message; added tests

---
 pandas/core/tools/datetimes.py               |  5 +++--
 pandas/tests/indexes/datetimes/test_tools.py | 20 ++++++++++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 1270a35a9a837..0a84452e53d5d 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -62,8 +62,9 @@ def should_cache(arg, check_count: int, unique_share: float):
     -------
     do_caching: bool
     """
-    assert 0 < check_count <= len(arg)
-    assert 0 < unique_share < 1
+    assert 0 < check_count <= len(arg), ('check_count must be in next bounds: '
+                                         '(0; len(arg)]')
+    assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)'
 
     do_caching = True
 
diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
index f401a7f7c9e9b..690102d1be017 100644
--- a/pandas/tests/indexes/datetimes/test_tools.py
+++ b/pandas/tests/indexes/datetimes/test_tools.py
@@ -2032,3 +2032,23 @@ def test_arg_tz_ns_unit(self, offset, utc, exp):
         result = to_datetime([arg], unit='ns', utc=utc)
         expected = to_datetime([exp])
         tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize('listlike,do_caching', [
+    ([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False),
+    ([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True)
+])
+def test_should_cache(listlike, do_caching):
+    assert tools.should_cache(listlike, check_count=len(listlike),
+                              unique_share=0.7) == do_caching
+
+
+@pytest.mark.parametrize('check_count,unique_share, err_message', [
+    (11, 0.5, r'check_count must be in next bounds: \(0; len\(arg\)]'),
+    (10, 2, r'unique_share must be in next bounds: \(0; 1\)')
+])
+def test_should_cache_errors(check_count, unique_share, err_message):
+    arg = [5] * 10
+
+    with pytest.raises(AssertionError, match=err_message):
+        tools.should_cache(arg, check_count, unique_share)

From 14ef1f23f8593dd63a81cf0f6dc6b531db32534b Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 10 Jun 2019 15:10:43 +0300
Subject: [PATCH 10/20] fix bug when 'check_count' == 0

---
 pandas/core/tools/datetimes.py               | 9 ++++++---
 pandas/tests/indexes/datetimes/test_tools.py | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 0a84452e53d5d..7e6ed6321ef46 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -54,7 +54,7 @@ def should_cache(arg, check_count: int, unique_share: float):
     ----------
     arg: listlike, tuple, 1-d array, Series
     check_count: int
-        0 < check_count <= len(arg)
+        0 <= check_count <= len(arg)
     unique_share: float
         0 < unique_share < 1
 
@@ -62,10 +62,13 @@ def should_cache(arg, check_count: int, unique_share: float):
     -------
     do_caching: bool
     """
-    assert 0 < check_count <= len(arg), ('check_count must be in next bounds: '
-                                         '(0; len(arg)]')
+    assert 0 <= check_count <= len(arg), ('check_count must be in next bounds:'
+                                          ' [0; len(arg)]')
     assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)'
 
+    if check_count == 0:
+        return False
+
     do_caching = True
 
     unique_elements = unique(arg[:check_count])
diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
index 690102d1be017..5d351b361a7a0 100644
--- a/pandas/tests/indexes/datetimes/test_tools.py
+++ b/pandas/tests/indexes/datetimes/test_tools.py
@@ -2044,7 +2044,7 @@ def test_should_cache(listlike, do_caching):
 
 
 @pytest.mark.parametrize('check_count,unique_share, err_message', [
-    (11, 0.5, r'check_count must be in next bounds: \(0; len\(arg\)]'),
+    (11, 0.5, r'check_count must be in next bounds: [0; len\(arg\)]'),
     (10, 2, r'unique_share must be in next bounds: \(0; 1\)')
 ])
 def test_should_cache_errors(check_count, unique_share, err_message):

From 0c2aaeaed1800c71324c6e8fc83b750327f715fd Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Mon, 10 Jun 2019 15:50:53 +0300
Subject: [PATCH 11/20] added escape symbols

---
 pandas/tests/indexes/datetimes/test_tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
index 5d351b361a7a0..44d6792d01b02 100644
--- a/pandas/tests/indexes/datetimes/test_tools.py
+++ b/pandas/tests/indexes/datetimes/test_tools.py
@@ -2044,7 +2044,7 @@ def test_should_cache(listlike, do_caching):
 
 
 @pytest.mark.parametrize('check_count,unique_share, err_message', [
-    (11, 0.5, r'check_count must be in next bounds: [0; len\(arg\)]'),
+    (11, 0.5, r'check_count must be in next bounds: \[0; len\(arg\)\]'),
     (10, 2, r'unique_share must be in next bounds: \(0; 1\)')
 ])
 def test_should_cache_errors(check_count, unique_share, err_message):

From 0fde7c8434a40e599b2ef3b1e41909a5d48b0ca8 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Tue, 11 Jun 2019 17:11:41 +0300
Subject: [PATCH 12/20] attempt to improve performance

---
 pandas/core/tools/datetimes.py               | 33 +++++++++++++-------
 pandas/tests/indexes/datetimes/test_tools.py |  8 ++---
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 7e6ed6321ef46..85df2ddafb51c 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -43,7 +43,7 @@ def _guess_datetime_format_for_array(arr, **kwargs):
         return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
 
 
-def should_cache(arg, check_count: int, unique_share: float):
+def should_cache(arg, unique_share=0.7, check_count=None):
     """
     Decides whether to do caching.
 
@@ -53,23 +53,34 @@ def should_cache(arg, check_count: int, unique_share: float):
     Parameters
     ----------
     arg: listlike, tuple, 1-d array, Series
-    check_count: int
-        0 <= check_count <= len(arg)
-    unique_share: float
+    unique_share: float or None
         0 < unique_share < 1
+    check_count: int or None
+        0 <= check_count <= len(arg)
 
     Returns
     -------
     do_caching: bool
     """
-    assert 0 <= check_count <= len(arg), ('check_count must be in next bounds:'
-                                          ' [0; len(arg)]')
-    assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)'
+    do_caching = True
 
-    if check_count == 0:
-        return False
+    # default heuristic: derive check_count from the length of arg
+    if check_count is None:
+        # in this case, the gain from caching is negligible
+        if len(arg) <= 50:
+            return False
 
-    do_caching = True
+        if len(arg) <= 5000:
+            check_count = int(len(arg) * 0.1)
+        else:
+            check_count = 500
+    else:
+        assert 0 <= check_count <= len(arg), \
+            'check_count must be in next bounds: [0; len(arg)]'
+        assert 0 < unique_share < 1, \
+            'unique_share must be in next bounds: (0; 1)'
+        if check_count == 0:
+            return False
 
     unique_elements = unique(arg[:check_count])
     if len(unique_elements) > check_count * unique_share:
@@ -102,7 +113,7 @@ def _maybe_cache(arg, format, cache, convert_listlike):
         # Perform a quicker unique check
         from pandas import Index
 
-        if not should_cache(arg, int(len(arg) * 0.1), 0.7):
+        if not should_cache(arg):
             return cache_array
 
         unique_dates = Index(arg).unique()
diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
index 44d6792d01b02..784633b2512ce 100644
--- a/pandas/tests/indexes/datetimes/test_tools.py
+++ b/pandas/tests/indexes/datetimes/test_tools.py
@@ -2043,12 +2043,12 @@ def test_should_cache(listlike, do_caching):
                               unique_share=0.7) == do_caching
 
 
-@pytest.mark.parametrize('check_count,unique_share, err_message', [
-    (11, 0.5, r'check_count must be in next bounds: \[0; len\(arg\)\]'),
+@pytest.mark.parametrize('unique_share,check_count, err_message', [
+    (0.5, 11, r'check_count must be in next bounds: \[0; len\(arg\)\]'),
     (10, 2, r'unique_share must be in next bounds: \(0; 1\)')
 ])
-def test_should_cache_errors(check_count, unique_share, err_message):
+def test_should_cache_errors(unique_share, check_count, err_message):
     arg = [5] * 10
 
     with pytest.raises(AssertionError, match=err_message):
-        tools.should_cache(arg, check_count, unique_share)
+        tools.should_cache(arg, unique_share, check_count)

From c74597abe8021a3dbc96b6cf5f52abafc141b850 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Tue, 11 Jun 2019 18:18:27 +0300
Subject: [PATCH 13/20] fix bug

---
 pandas/core/tools/datetimes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 85df2ddafb51c..a9f8da84713ac 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -77,11 +77,11 @@ def should_cache(arg, unique_share=0.7, check_count=None):
     else:
         assert 0 <= check_count <= len(arg), \
             'check_count must be in next bounds: [0; len(arg)]'
-        assert 0 < unique_share < 1, \
-            'unique_share must be in next bounds: (0; 1)'
         if check_count == 0:
             return False
 
+    assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)'
+
     unique_elements = unique(arg[:check_count])
     if len(unique_elements) > check_count * unique_share:
         do_caching = False

From 98e18a808d521c22a24b72b46f8de183fa9499ad Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Thu, 13 Jun 2019 14:19:28 +0300
Subject: [PATCH 14/20] fixed problems found by review

---
 pandas/core/tools/datetimes.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index a9f8da84713ac..d3dc60fb08b38 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -12,7 +12,6 @@
 from pandas._libs.tslibs.strptime import array_strptime
 from pandas.util._decorators import deprecate_kwarg
 
-from pandas.core.algorithms import unique
 from pandas.core.dtypes.common import (
     ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype,
     is_datetime64tz_dtype, is_float, is_integer, is_integer_dtype,
@@ -23,6 +22,14 @@
 
 from pandas._typing import ArrayLike
 from pandas.core import algorithms
+from pandas.core.algorithms import unique
+
+# ---------------------------------------------------------------------
+# types used in annotations
+
+ArrayConvertible = Union[list, tuple, ArrayLike, ABCSeries]
+
+# ---------------------------------------------------------------------
 
 # ---------------------------------------------------------------------
 # types used in annotations
@@ -43,7 +50,8 @@ def _guess_datetime_format_for_array(arr, **kwargs):
         return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
 
 
-def should_cache(arg, unique_share=0.7, check_count=None):
+def should_cache(arg: ArrayConvertible, unique_share: float = 0.7,
+                 check_count: Optional[int] = None) -> bool:
     """
     Decides whether to do caching.
 
@@ -53,14 +61,22 @@ def should_cache(arg, unique_share=0.7, check_count=None):
     Parameters
     ----------
     arg: listlike, tuple, 1-d array, Series
-    unique_share: float or None
+    unique_share: float, default=0.7, optional
         0 < unique_share < 1
-    check_count: int or None
+    check_count: int, optional
         0 <= check_count <= len(arg)
 
     Returns
     -------
     do_caching: bool
+
+    Notes
+    -----
+    By default, sequences of 50 items or fewer are never cached; for up to
+    5000 items, the first ten percent of the elements are checked for their
+    share of unique values; for more than 5000 items, only the first 500
+    elements are checked.
+    All constants were chosen empirically.
     """
     do_caching = True
 

From 201e7fc5b6c24f5b3d2b0ed39969edf469ece552 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoly.myachev@intel.com>
Date: Wed, 3 Jul 2019 22:21:38 +0300
Subject: [PATCH 15/20] removed excess conversion 'to_numpy'

---
 pandas/core/tools/datetimes.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index d3dc60fb08b38..4f1bdbda25828 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -134,8 +134,7 @@ def _maybe_cache(arg, format, cache, convert_listlike):
 
         unique_dates = Index(arg).unique()
         if len(unique_dates) < len(arg):
-            cache_dates = convert_listlike(unique_dates.to_numpy(),
-                                           True, format)
+            cache_dates = convert_listlike(unique_dates, True, format)
             cache_array = Series(cache_dates, index=unique_dates)
     return cache_array
 

From 209bb83f49e402d5e1d542be5d5c83637d194717 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Wed, 3 Jul 2019 16:56:53 -0400
Subject: [PATCH 16/20] fix up benchmarks

---
 asv_bench/benchmarks/io/csv.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 6beb21883b5ab..bc12f4b266ac4 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -4,7 +4,6 @@
 import numpy as np
 import pandas.util.testing as tm
 from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime
-from pandas.io.parsers import _parser_defaults
 from io import StringIO
 
 from ..pandas_vb_common import BaseIO
@@ -272,13 +271,12 @@ def setup(self, do_cache):
         self.StringIO_input = StringIO(data)
 
     def time_read_csv_cached(self, do_cache):
-        # kwds setting here is used to avoid breaking tests in
-        # previous version of pandas, because this is api changes
-        kwds = {}
-        if 'cache_dates' in _parser_defaults:
-            kwds['cache_dates'] = do_cache
-        read_csv(self.data(self.StringIO_input), header=None,
-                 parse_dates=[0], **kwds)
+        try:
+            read_csv(self.data(self.StringIO_input), header=None,
+                     parse_dates=[0], cache_dates=do_cache)
+        except TypeError:
+            # cache_dates is a new keyword in 0.25
+            pass
 
 
 class ReadCSVMemoryGrowth(BaseIO):
@@ -329,9 +327,14 @@ def setup(self, cache_dates):
         self.StringIO_input = StringIO(data)
 
     def time_read_csv_dayfirst(self, cache_dates):
-        read_csv(self.data(self.StringIO_input), sep=',', header=None,
-                 names=['Date'], parse_dates=['Date'], cache_dates=cache_dates,
-                 dayfirst=True)
+        try:
+            read_csv(self.data(self.StringIO_input), sep=',', header=None,
+                     names=['Date'], parse_dates=['Date'], cache_dates=cache_dates,
+                     dayfirst=True)
+        except TypeError:
+            # cache_dates is a new keyword in 0.25
+            pass
+
 
     def time_to_datetime_dayfirst(self, cache_dates):
         df = read_csv(self.data(self.StringIO_input),

From 1c78f5045e855ecc026e6dd763965b889da7eaeb Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Wed, 3 Jul 2019 15:58:41 -0500
Subject: [PATCH 17/20] lint

---
 asv_bench/benchmarks/io/csv.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index bc12f4b266ac4..de2d3b46eb448 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -329,7 +329,8 @@ def setup(self, cache_dates):
     def time_read_csv_dayfirst(self, cache_dates):
         try:
             read_csv(self.data(self.StringIO_input), sep=',', header=None,
-                     names=['Date'], parse_dates=['Date'], cache_dates=cache_dates,
+                     names=['Date'], parse_dates=['Date'],
+                     cache_dates=cache_dates,
                      dayfirst=True)
         except TypeError:
             # cache_dates is a new keyword in 0.25

From 7dffc3e1f86303ef7570e39b9dcc67ae0f6ffbb2 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Wed, 3 Jul 2019 17:38:29 -0400
Subject: [PATCH 18/20] lint

---
 asv_bench/benchmarks/io/csv.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index de2d3b46eb448..fbb96380a5813 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -336,7 +336,6 @@ def time_read_csv_dayfirst(self, cache_dates):
             # cache_dates is a new keyword in 0.25
             pass
 
-
     def time_to_datetime_dayfirst(self, cache_dates):
         df = read_csv(self.data(self.StringIO_input),
                       dtype={'date': str}, names=['date'])

From 40114b7242dbaf09c99731adc295b07206129251 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Wed, 3 Jul 2019 19:24:35 -0400
Subject: [PATCH 19/20] patch for cache with bad dates

---
 pandas/core/tools/datetimes.py             |  2 +-
 pandas/tests/io/parser/test_parse_dates.py | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 4f1bdbda25828..29e0712acc8bc 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -132,7 +132,7 @@ def _maybe_cache(arg, format, cache, convert_listlike):
         if not should_cache(arg):
             return cache_array
 
-        unique_dates = Index(arg).unique()
+        unique_dates = Index(arg).unique().values
         if len(unique_dates) < len(arg):
             cache_dates = convert_listlike(unique_dates, True, format)
             cache_array = Series(cache_dates, index=unique_dates)
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index b0c3944e0aff8..25589a1682f7a 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -635,6 +635,21 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
         parser.read_csv(StringIO(data), parse_dates=(1,))
 
 
+@pytest.mark.parametrize("cache_dates", [True, False])
+@pytest.mark.parametrize("value", [
+    'nan', '0', ''])
+def test_bad_date_parse(all_parsers, cache_dates, value):
+    # if we have an invalid date make sure that we handle this properly
+    # both with and without the cache
+    parser = all_parsers
+    s = StringIO(('%s,\n' % value) * 50000)
+
+    parser.read_csv(s,
+                    header=None, names=['foo', 'bar'], parse_dates=['foo'],
+                    infer_datetime_format=False,
+                    cache_dates=cache_dates)
+
+
 def test_parse_dates_empty_string(all_parsers):
     # see gh-2263
     parser = all_parsers

From ebc8815fbde7728527bafb16efd2dfb86a4032bd Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Wed, 3 Jul 2019 19:33:56 -0400
Subject: [PATCH 20/20] better unique

---
 pandas/core/tools/datetimes.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 29e0712acc8bc..3e3318ed4c4b6 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -127,12 +127,10 @@ def _maybe_cache(arg, format, cache, convert_listlike):
     cache_array = Series()
     if cache:
         # Perform a quicker unique check
-        from pandas import Index
-
         if not should_cache(arg):
             return cache_array
 
-        unique_dates = Index(arg).unique().values
+        unique_dates = unique(arg)
         if len(unique_dates) < len(arg):
             cache_dates = convert_listlike(unique_dates, True, format)
             cache_array = Series(cache_dates, index=unique_dates)