From c60ad2fabaa5e466f2436ff83a5b71e58743bb49 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 10 Apr 2019 21:05:52 +0300 Subject: [PATCH 01/20] changed default value of cache parameter to True --- pandas/core/tools/datetimes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index d543ae91ad344..bb7e589608404 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -448,7 +448,7 @@ def _adjust_to_origin(arg, origin, unit): def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix', - cache=False): + cache=True): """ Convert argument to datetime. @@ -529,12 +529,12 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, origin. .. versionadded:: 0.20.0 - cache : boolean, default False + cache : boolean, default True If True, use a cache of unique, converted dates to apply the datetime conversion. May produce significant speed-up when parsing duplicate date strings, especially ones with timezone offsets. - .. versionadded:: 0.23.0 + .. versionchanged:: 0.25.0 Returns ------- From 1b71f8c721f67ad3ce84ae286d3ed5a792b9ccc0 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 17 May 2019 15:31:32 +0300 Subject: [PATCH 02/20] added new entry for cache changing --- pandas/core/tools/datetimes.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index bb7e589608404..456359ede6566 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -534,7 +534,11 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, conversion. May produce significant speed-up when parsing duplicate date strings, especially ones with timezone offsets. - .. versionchanged:: 0.25.0 + .. 
versionadded:: 0.25.0 + + cache : boolean, default False + + .. versionadded:: 0.23.0 Returns ------- From 461776263e9d3e7c4fc2a4563f2fbc3b65a3b058 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 17 May 2019 15:42:01 +0300 Subject: [PATCH 03/20] added whatsnew note in the performance section --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ea6a04ac726b7..aa81a687d639d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -879,6 +879,7 @@ Performance improvements - Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`) - Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`) - Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`) +- For :meth:`to_datetime` changed default value of cache parameter to ``True`` (:issue:`26043`) .. _whatsnew_0250.bug_fixes: From 4fcb04ac365a89f863157c925e429d8a7add3f3b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 20 May 2019 14:46:48 +0300 Subject: [PATCH 04/20] removed: 'cache : boolean, default False' --- pandas/core/tools/datetimes.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 456359ede6566..b915083e07b31 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -536,10 +536,6 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, .. versionadded:: 0.25.0 - cache : boolean, default False - - .. versionadded:: 0.23.0 - Returns ------- ret : datetime if parsing succeeded. From db0fe62739ccf65b6b060dfa861bc78a9a9748e0 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 20 May 2019 15:45:44 +0300 Subject: [PATCH 05/20] added '.. 
versionchanged' --- pandas/core/tools/datetimes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b915083e07b31..f6fe000a7f34c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -534,7 +534,10 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, conversion. May produce significant speed-up when parsing duplicate date strings, especially ones with timezone offsets. - .. versionadded:: 0.25.0 + .. versionadded:: 0.23.0 + + .. versionchanged:: 0.25.0 + - changed default value from False to True Returns ------- From 7d9f0efe28e337b928daf25dbb974ae386cb1e89 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 9 Jun 2019 23:25:35 +0300 Subject: [PATCH 06/20] added benchmark --- asv_bench/benchmarks/timeseries.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 7de1c42246ad5..14ee8747cf81d 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -300,6 +300,19 @@ def time_format_YYYYMMDD(self): to_datetime(self.stringsD, format='%Y%m%d') +class ToDatetimeCacheSmallCount(object): + + params = ([True, False], [50, 500, 5000, 100000]) + param_names = ['cache', 'count'] + + def setup(self, cache, count): + rng = date_range(start='1/1/1971', periods=count) + self.unique_date_strings = rng.strftime('%Y-%m-%d').tolist() + + def time_unique_date_strings(self, cache, count): + to_datetime(self.unique_date_strings, cache=cache) + + class ToDatetimeISO8601: def setup(self): From c51f00defd410d7c1a70ac96671abcec3e113293 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 9 Jun 2019 23:28:10 +0300 Subject: [PATCH 07/20] added heuristic to decrease slowdowns for unique arrays --- pandas/core/tools/datetimes.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git 
a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index f6fe000a7f34c..ade560c4e1b71 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -42,6 +42,31 @@ def _guess_datetime_format_for_array(arr, **kwargs): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) +def do_cache(arg, check_count: int, unique_share: float): + """ + Decides whether to do caching. + + If the percent of unique elements among `check_count` elements less + than `unique_share * 100` then we can do caching. + + Parameters + ---------- + arg: list, tuple, 1-d array, Series + check_count: int + unique_share: float + + Returns + ------- + : bool + """ + from pandas.core.algorithms import unique + + unique = unique(arg[:check_count]) + if len(unique) > check_count * unique_share: + return False + return True + + def _maybe_cache(arg, format, cache, convert_listlike): """ Create a cache of unique dates from an array of dates @@ -66,6 +91,10 @@ def _maybe_cache(arg, format, cache, convert_listlike): if cache: # Perform a quicker unique check from pandas import Index + + if not do_cache(arg, int(len(arg) * 0.1), 0.7): + return cache_array + unique_dates = Index(arg).unique() if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates.to_numpy(), From 417e005d3432c2763d688fa91ce623709dccfb3f Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 10 Jun 2019 12:45:22 +0300 Subject: [PATCH 08/20] corrected the code due to the reviewers comments --- pandas/core/tools/datetimes.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ade560c4e1b71..1270a35a9a837 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -12,6 +12,7 @@ from pandas._libs.tslibs.strptime import array_strptime from pandas.util._decorators import deprecate_kwarg +from pandas.core.algorithms import unique from 
pandas.core.dtypes.common import ( ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, is_float, is_integer, is_integer_dtype, @@ -42,7 +43,7 @@ def _guess_datetime_format_for_array(arr, **kwargs): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) -def do_cache(arg, check_count: int, unique_share: float): +def should_cache(arg, check_count: int, unique_share: float): """ Decides whether to do caching. @@ -51,20 +52,25 @@ def do_cache(arg, check_count: int, unique_share: float): Parameters ---------- - arg: list, tuple, 1-d array, Series + arg: listlike, tuple, 1-d array, Series check_count: int + 0 < check_count <= len(arg) unique_share: float + 0 < unique_share < 1 Returns ------- - : bool + do_caching: bool """ - from pandas.core.algorithms import unique + assert 0 < check_count <= len(arg) + assert 0 < unique_share < 1 - unique = unique(arg[:check_count]) - if len(unique) > check_count * unique_share: - return False - return True + do_caching = True + + unique_elements = unique(arg[:check_count]) + if len(unique_elements) > check_count * unique_share: + do_caching = False + return do_caching def _maybe_cache(arg, format, cache, convert_listlike): @@ -73,7 +79,7 @@ def _maybe_cache(arg, format, cache, convert_listlike): Parameters ---------- - arg : integer, float, string, datetime, list, tuple, 1-d array, Series + arg : listlike, tuple, 1-d array, Series format : string Strftime format to parse time cache : boolean @@ -92,7 +98,7 @@ def _maybe_cache(arg, format, cache, convert_listlike): # Perform a quicker unique check from pandas import Index - if not do_cache(arg, int(len(arg) * 0.1), 0.7): + if not should_cache(arg, int(len(arg) * 0.1), 0.7): return cache_array unique_dates = Index(arg).unique() From ed0725ef06be5fa4ab4e77635318dd5b99c4192f Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 10 Jun 2019 13:22:22 +0300 Subject: [PATCH 09/20] added errors message; added tests --- 
pandas/core/tools/datetimes.py | 5 +++-- pandas/tests/indexes/datetimes/test_tools.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 1270a35a9a837..0a84452e53d5d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -62,8 +62,9 @@ def should_cache(arg, check_count: int, unique_share: float): ------- do_caching: bool """ - assert 0 < check_count <= len(arg) - assert 0 < unique_share < 1 + assert 0 < check_count <= len(arg), ('check_count must be in next bounds: ' + '(0; len(arg)]') + assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)' do_caching = True diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index f401a7f7c9e9b..690102d1be017 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -2032,3 +2032,23 @@ def test_arg_tz_ns_unit(self, offset, utc, exp): result = to_datetime([arg], unit='ns', utc=utc) expected = to_datetime([exp]) tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize('listlike,do_caching', [ + ([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False), + ([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True) +]) +def test_should_cache(listlike, do_caching): + assert tools.should_cache(listlike, check_count=len(listlike), + unique_share=0.7) == do_caching + + +@pytest.mark.parametrize('check_count,unique_share, err_message', [ + (11, 0.5, r'check_count must be in next bounds: \(0; len\(arg\)]'), + (10, 2, r'unique_share must be in next bounds: \(0; 1\)') +]) +def test_should_cache_errors(check_count, unique_share, err_message): + arg = [5] * 10 + + with pytest.raises(AssertionError, match=err_message): + tools.should_cache(arg, check_count, unique_share) From 14ef1f23f8593dd63a81cf0f6dc6b531db32534b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 10 Jun 2019 15:10:43 +0300 Subject: 
[PATCH 10/20] fix bug when 'check_count' == 0 --- pandas/core/tools/datetimes.py | 9 ++++++--- pandas/tests/indexes/datetimes/test_tools.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 0a84452e53d5d..7e6ed6321ef46 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -54,7 +54,7 @@ def should_cache(arg, check_count: int, unique_share: float): ---------- arg: listlike, tuple, 1-d array, Series check_count: int - 0 < check_count <= len(arg) + 0 <= check_count <= len(arg) unique_share: float 0 < unique_share < 1 @@ -62,10 +62,13 @@ def should_cache(arg, check_count: int, unique_share: float): ------- do_caching: bool """ - assert 0 < check_count <= len(arg), ('check_count must be in next bounds: ' - '(0; len(arg)]') + assert 0 <= check_count <= len(arg), ('check_count must be in next bounds:' + ' [0; len(arg)]') assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)' + if check_count == 0: + return False + do_caching = True unique_elements = unique(arg[:check_count]) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 690102d1be017..5d351b361a7a0 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -2044,7 +2044,7 @@ def test_should_cache(listlike, do_caching): @pytest.mark.parametrize('check_count,unique_share, err_message', [ - (11, 0.5, r'check_count must be in next bounds: \(0; len\(arg\)]'), + (11, 0.5, r'check_count must be in next bounds: [0; len\(arg\)]'), (10, 2, r'unique_share must be in next bounds: \(0; 1\)') ]) def test_should_cache_errors(check_count, unique_share, err_message): From 0c2aaeaed1800c71324c6e8fc83b750327f715fd Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 10 Jun 2019 15:50:53 +0300 Subject: [PATCH 11/20] added escape symbols --- pandas/tests/indexes/datetimes/test_tools.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 5d351b361a7a0..44d6792d01b02 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -2044,7 +2044,7 @@ def test_should_cache(listlike, do_caching): @pytest.mark.parametrize('check_count,unique_share, err_message', [ - (11, 0.5, r'check_count must be in next bounds: [0; len\(arg\)]'), + (11, 0.5, r'check_count must be in next bounds: \[0; len\(arg\)\]'), (10, 2, r'unique_share must be in next bounds: \(0; 1\)') ]) def test_should_cache_errors(check_count, unique_share, err_message): From 0fde7c8434a40e599b2ef3b1e41909a5d48b0ca8 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 11 Jun 2019 17:11:41 +0300 Subject: [PATCH 12/20] attempt to improve performance --- pandas/core/tools/datetimes.py | 33 +++++++++++++------- pandas/tests/indexes/datetimes/test_tools.py | 8 ++--- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7e6ed6321ef46..85df2ddafb51c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -43,7 +43,7 @@ def _guess_datetime_format_for_array(arr, **kwargs): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) -def should_cache(arg, check_count: int, unique_share: float): +def should_cache(arg, unique_share=0.7, check_count=None): """ Decides whether to do caching. 
@@ -53,23 +53,34 @@ def should_cache(arg, check_count: int, unique_share: float): Parameters ---------- arg: listlike, tuple, 1-d array, Series - check_count: int - 0 <= check_count <= len(arg) - unique_share: float + unique_share: float or None 0 < unique_share < 1 + check_count: int or None + 0 <= check_count <= len(arg) Returns ------- do_caching: bool """ - assert 0 <= check_count <= len(arg), ('check_count must be in next bounds:' - ' [0; len(arg)]') - assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)' + do_caching = True - if check_count == 0: - return False + # default realization + if check_count is None: + # in this case, the gain from caching is negligible + if len(arg) <= 50: + return False - do_caching = True + if len(arg) <= 5000: + check_count = int(len(arg) * 0.1) + else: + check_count = 500 + else: + assert 0 <= check_count <= len(arg), \ + 'check_count must be in next bounds: [0; len(arg)]' + assert 0 < unique_share < 1, \ + 'unique_share must be in next bounds: (0; 1)' + if check_count == 0: + return False unique_elements = unique(arg[:check_count]) if len(unique_elements) > check_count * unique_share: @@ -102,7 +113,7 @@ def _maybe_cache(arg, format, cache, convert_listlike): # Perform a quicker unique check from pandas import Index - if not should_cache(arg, int(len(arg) * 0.1), 0.7): + if not should_cache(arg): return cache_array unique_dates = Index(arg).unique() diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 44d6792d01b02..784633b2512ce 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -2043,12 +2043,12 @@ def test_should_cache(listlike, do_caching): unique_share=0.7) == do_caching -@pytest.mark.parametrize('check_count,unique_share, err_message', [ - (11, 0.5, r'check_count must be in next bounds: \[0; len\(arg\)\]'), +@pytest.mark.parametrize('unique_share,check_count, err_message', [ + (0.5, 
11, r'check_count must be in next bounds: \[0; len\(arg\)\]'), (10, 2, r'unique_share must be in next bounds: \(0; 1\)') ]) -def test_should_cache_errors(check_count, unique_share, err_message): +def test_should_cache_errors(unique_share, check_count, err_message): arg = [5] * 10 with pytest.raises(AssertionError, match=err_message): - tools.should_cache(arg, check_count, unique_share) + tools.should_cache(arg, unique_share, check_count) From c74597abe8021a3dbc96b6cf5f52abafc141b850 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 11 Jun 2019 18:18:27 +0300 Subject: [PATCH 13/20] fix bug --- pandas/core/tools/datetimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 85df2ddafb51c..a9f8da84713ac 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -77,11 +77,11 @@ def should_cache(arg, unique_share=0.7, check_count=None): else: assert 0 <= check_count <= len(arg), \ 'check_count must be in next bounds: [0; len(arg)]' - assert 0 < unique_share < 1, \ - 'unique_share must be in next bounds: (0; 1)' if check_count == 0: return False + assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)' + unique_elements = unique(arg[:check_count]) if len(unique_elements) > check_count * unique_share: do_caching = False From 98e18a808d521c22a24b72b46f8de183fa9499ad Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 13 Jun 2019 14:19:28 +0300 Subject: [PATCH 14/20] fixed problems found by review --- pandas/core/tools/datetimes.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index a9f8da84713ac..d3dc60fb08b38 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -12,7 +12,6 @@ from pandas._libs.tslibs.strptime import array_strptime from pandas.util._decorators import deprecate_kwarg -from 
pandas.core.algorithms import unique from pandas.core.dtypes.common import ( ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, is_float, is_integer, is_integer_dtype, @@ -23,6 +22,14 @@ from pandas._typing import ArrayLike from pandas.core import algorithms +from pandas.core.algorithms import unique + +# --------------------------------------------------------------------- +# types used in annotations + +ArrayConvertible = Union[list, tuple, ArrayLike, ABCSeries] + +# --------------------------------------------------------------------- # --------------------------------------------------------------------- # types used in annotations @@ -43,7 +50,8 @@ def _guess_datetime_format_for_array(arr, **kwargs): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) -def should_cache(arg, unique_share=0.7, check_count=None): +def should_cache(arg: ArrayConvertible, unique_share: float = 0.7, + check_count: Optional[int] = None) -> bool: """ Decides whether to do caching. @@ -53,14 +61,22 @@ def should_cache(arg, unique_share=0.7, check_count=None): Parameters ---------- arg: listlike, tuple, 1-d array, Series - unique_share: float or None + unique_share: float, default=0.7, optional 0 < unique_share < 1 - check_count: int or None + check_count: int, optional 0 <= check_count <= len(arg) Returns ------- do_caching: bool + + Notes + ----- + By default for a sequence of less than 50 items in size, we don't do + caching; for the number of elements less than 5000, we take ten percent of + all elements to check for a uniqueness share; if the sequence size is more + than 5000, then we check only the first 500 elements. + All constants were chosen empirically by. 
""" do_caching = True From 201e7fc5b6c24f5b3d2b0ed39969edf469ece552 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 3 Jul 2019 22:21:38 +0300 Subject: [PATCH 15/20] removed excess conversion 'to_numpy' --- pandas/core/tools/datetimes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index d3dc60fb08b38..4f1bdbda25828 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -134,8 +134,7 @@ def _maybe_cache(arg, format, cache, convert_listlike): unique_dates = Index(arg).unique() if len(unique_dates) < len(arg): - cache_dates = convert_listlike(unique_dates.to_numpy(), - True, format) + cache_dates = convert_listlike(unique_dates, True, format) cache_array = Series(cache_dates, index=unique_dates) return cache_array From 209bb83f49e402d5e1d542be5d5c83637d194717 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 3 Jul 2019 16:56:53 -0400 Subject: [PATCH 16/20] fix up bencharks --- asv_bench/benchmarks/io/csv.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 6beb21883b5ab..bc12f4b266ac4 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -4,7 +4,6 @@ import numpy as np import pandas.util.testing as tm from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime -from pandas.io.parsers import _parser_defaults from io import StringIO from ..pandas_vb_common import BaseIO @@ -272,13 +271,12 @@ def setup(self, do_cache): self.StringIO_input = StringIO(data) def time_read_csv_cached(self, do_cache): - # kwds setting here is used to avoid breaking tests in - # previous version of pandas, because this is api changes - kwds = {} - if 'cache_dates' in _parser_defaults: - kwds['cache_dates'] = do_cache - read_csv(self.data(self.StringIO_input), header=None, - parse_dates=[0], **kwds) + try: + 
read_csv(self.data(self.StringIO_input), header=None, + parse_dates=[0], cache_dates=do_cache) + except TypeError: + # cache_dates is a new keyword in 0.25 + pass class ReadCSVMemoryGrowth(BaseIO): @@ -329,9 +327,14 @@ def setup(self, cache_dates): self.StringIO_input = StringIO(data) def time_read_csv_dayfirst(self, cache_dates): - read_csv(self.data(self.StringIO_input), sep=',', header=None, - names=['Date'], parse_dates=['Date'], cache_dates=cache_dates, - dayfirst=True) + try: + read_csv(self.data(self.StringIO_input), sep=',', header=None, + names=['Date'], parse_dates=['Date'], cache_dates=cache_dates, + dayfirst=True) + except TypeError: + # cache_dates is a new keyword in 0.25 + pass + def time_to_datetime_dayfirst(self, cache_dates): df = read_csv(self.data(self.StringIO_input), From 1c78f5045e855ecc026e6dd763965b889da7eaeb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 3 Jul 2019 15:58:41 -0500 Subject: [PATCH 17/20] lint --- asv_bench/benchmarks/io/csv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index bc12f4b266ac4..de2d3b46eb448 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -329,7 +329,8 @@ def setup(self, cache_dates): def time_read_csv_dayfirst(self, cache_dates): try: read_csv(self.data(self.StringIO_input), sep=',', header=None, - names=['Date'], parse_dates=['Date'], cache_dates=cache_dates, + names=['Date'], parse_dates=['Date'], + cache_dates=cache_dates, dayfirst=True) except TypeError: # cache_dates is a new keyword in 0.25 From 7dffc3e1f86303ef7570e39b9dcc67ae0f6ffbb2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 3 Jul 2019 17:38:29 -0400 Subject: [PATCH 18/20] lint --- asv_bench/benchmarks/io/csv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index de2d3b46eb448..fbb96380a5813 100644 --- a/asv_bench/benchmarks/io/csv.py +++ 
b/asv_bench/benchmarks/io/csv.py @@ -336,7 +336,6 @@ def time_read_csv_dayfirst(self, cache_dates): # cache_dates is a new keyword in 0.25 pass - def time_to_datetime_dayfirst(self, cache_dates): df = read_csv(self.data(self.StringIO_input), dtype={'date': str}, names=['date']) From 40114b7242dbaf09c99731adc295b07206129251 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 3 Jul 2019 19:24:35 -0400 Subject: [PATCH 19/20] patch for cache with bad dates --- pandas/core/tools/datetimes.py | 2 +- pandas/tests/io/parser/test_parse_dates.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 4f1bdbda25828..29e0712acc8bc 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -132,7 +132,7 @@ def _maybe_cache(arg, format, cache, convert_listlike): if not should_cache(arg): return cache_array - unique_dates = Index(arg).unique() + unique_dates = Index(arg).unique().values if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates, True, format) cache_array = Series(cache_dates, index=unique_dates) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index b0c3944e0aff8..25589a1682f7a 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -635,6 +635,21 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser.read_csv(StringIO(data), parse_dates=(1,)) +@pytest.mark.parametrize("cache_dates", [True, False]) +@pytest.mark.parametrize("value", [ + 'nan', '0', '']) +def test_bad_date_parse(all_parsers, cache_dates, value): + # if we have an invalid date make sure that we handle this with + # and w/o the cache properly + parser = all_parsers + s = StringIO(('%s,\n' % value) * 50000) + + parser.read_csv(s, + header=None, names=['foo', 'bar'], parse_dates=['foo'], + infer_datetime_format=False, + 
cache_dates=cache_dates) + + def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers From ebc8815fbde7728527bafb16efd2dfb86a4032bd Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 3 Jul 2019 19:33:56 -0400 Subject: [PATCH 20/20] better unique --- pandas/core/tools/datetimes.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 29e0712acc8bc..3e3318ed4c4b6 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -127,12 +127,10 @@ def _maybe_cache(arg, format, cache, convert_listlike): cache_array = Series() if cache: # Perform a quicker unique check - from pandas import Index - if not should_cache(arg): return cache_array - unique_dates = Index(arg).unique().values + unique_dates = unique(arg) if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates, True, format) cache_array = Series(cache_dates, index=unique_dates)