22
22
23
23
from pandas ._typing import ArrayLike
24
24
from pandas .core import algorithms
25
+ from pandas .core .algorithms import unique
26
+
27
+ # ---------------------------------------------------------------------
28
+ # types used in annotations
29
+
30
+ ArrayConvertible = Union [list , tuple , ArrayLike , ABCSeries ]
31
+
32
+ # ---------------------------------------------------------------------
25
33
26
34
# ---------------------------------------------------------------------
27
35
# types used in annotations
@@ -42,13 +50,67 @@ def _guess_datetime_format_for_array(arr, **kwargs):
42
50
return _guess_datetime_format (arr [non_nan_elements [0 ]], ** kwargs )
43
51
44
52
53
def should_cache(arg: ArrayConvertible, unique_share: float = 0.7,
                 check_count: Optional[int] = None) -> bool:
    """
    Decide whether building a cache of converted unique values pays off.

    The first `check_count` elements of `arg` are sampled. Caching is
    recommended when the number of unique values in that sample does not
    exceed ``check_count * unique_share``.

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
    unique_share : float, default 0.7
        Must satisfy 0 < unique_share < 1.
    check_count : int, optional
        Must satisfy 0 <= check_count <= len(arg).

    Returns
    -------
    do_caching : bool

    Raises
    ------
    AssertionError
        If `check_count` or `unique_share` is outside its allowed bounds.

    Notes
    -----
    When `check_count` is not given, a sequence of 50 elements or fewer is
    never cached; for a sequence of up to 5000 elements the first ten
    percent of the elements are sampled; for a longer sequence only the
    first 500 elements are sampled.
    The constants were chosen empirically.

    `unique_share` is only validated when a sample is actually taken; the
    early ``False`` returns happen before that check.
    """
    if check_count is None:
        # default realization
        size = len(arg)
        # in this case, the gain from caching is negligible
        if size <= 50:
            return False
        check_count = int(size * 0.1) if size <= 5000 else 500
    else:
        # Raised explicitly (instead of using an ``assert`` statement) so
        # the validation is not stripped when Python runs with ``-O``; the
        # exception type and message are unchanged for callers.
        if not 0 <= check_count <= len(arg):
            raise AssertionError(
                'check_count must be in next bounds: [0; len(arg)]')
        if check_count == 0:
            return False

    if not 0 < unique_share < 1:
        raise AssertionError('unique_share must be in next bounds: (0; 1)')

    unique_elements = unique(arg[:check_count])
    # Too many distinct values in the sample means a cache would rarely hit.
    return len(unique_elements) <= check_count * unique_share
105
+
106
+
45
107
def _maybe_cache (arg , format , cache , convert_listlike ):
46
108
"""
47
109
Create a cache of unique dates from an array of dates
48
110
49
111
Parameters
50
112
----------
51
- arg : integer, float, string, datetime, list , tuple, 1-d array, Series
113
+ arg : listlike , tuple, 1-d array, Series
52
114
format : string
53
115
Strftime format to parse time
54
116
cache : boolean
@@ -65,11 +127,12 @@ def _maybe_cache(arg, format, cache, convert_listlike):
65
127
cache_array = Series ()
66
128
if cache :
67
129
# Perform a quicker unique check
68
- from pandas import Index
69
- unique_dates = Index (arg ).unique ()
130
+ if not should_cache (arg ):
131
+ return cache_array
132
+
133
+ unique_dates = unique (arg )
70
134
if len (unique_dates ) < len (arg ):
71
- cache_dates = convert_listlike (unique_dates .to_numpy (),
72
- True , format )
135
+ cache_dates = convert_listlike (unique_dates , True , format )
73
136
cache_array = Series (cache_dates , index = unique_dates )
74
137
return cache_array
75
138
@@ -448,7 +511,7 @@ def _adjust_to_origin(arg, origin, unit):
448
511
def to_datetime (arg , errors = 'raise' , dayfirst = False , yearfirst = False ,
449
512
utc = None , box = True , format = None , exact = True ,
450
513
unit = None , infer_datetime_format = False , origin = 'unix' ,
451
- cache = False ):
514
+ cache = True ):
452
515
"""
453
516
Convert argument to datetime.
454
517
@@ -529,13 +592,16 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
529
592
origin.
530
593
531
594
.. versionadded:: 0.20.0
532
- cache : boolean, default False
595
+ cache : boolean, default True
533
596
If True, use a cache of unique, converted dates to apply the datetime
534
597
conversion. May produce significant speed-up when parsing duplicate
535
598
date strings, especially ones with timezone offsets.
536
599
537
600
.. versionadded:: 0.23.0
538
601
602
+ .. versionchanged:: 0.25.0
603
+ - changed default value from False to True
604
+
539
605
Returns
540
606
-------
541
607
ret : datetime if parsing succeeded.
0 commit comments