39
39
40
40
from pandas ._typing import ArrayLike
41
41
from pandas .core import algorithms
42
+ from pandas .core .algorithms import unique
43
+
44
+ # ---------------------------------------------------------------------
45
+ # types used in annotations
46
+
47
# Inputs accepted wherever an array of date-like values can be converted:
# plain Python sequences, array-likes and Series.
ArrayConvertible = Union[list, tuple, ArrayLike, ABCSeries]
48
+
49
+ # ---------------------------------------------------------------------
42
50
43
51
# ---------------------------------------------------------------------
44
52
# types used in annotations
@@ -60,13 +68,67 @@ def _guess_datetime_format_for_array(arr, **kwargs):
60
68
return _guess_datetime_format (arr [non_nan_elements [0 ]], ** kwargs )
61
69
62
70
71
def should_cache(arg: ArrayConvertible, unique_share: float = 0.7,
                 check_count: Optional[int] = None) -> bool:
    """
    Decide whether to do caching.

    Caching is considered worthwhile when the number of unique elements
    among the first `check_count` elements of `arg` does not exceed
    `check_count * unique_share`.

    Parameters
    ----------
    arg : listlike, tuple, 1-d array, Series
        Values to inspect; must support ``len`` and slicing.
    unique_share : float, default 0.7
        Maximum share of unique elements for which caching is still done;
        must satisfy ``0 < unique_share < 1``.
    check_count : int, optional
        Number of leading elements to inspect; must satisfy
        ``0 <= check_count <= len(arg)``. When omitted, it is derived from
        ``len(arg)`` (see Notes).

    Returns
    -------
    do_caching : bool

    Raises
    ------
    AssertionError
        If `check_count` or `unique_share` is outside its bounds. Note that
        `unique_share` is only validated once a non-empty sample is going
        to be inspected.

    Notes
    -----
    By default, for a sequence of 50 items or fewer we don't do caching; for
    a sequence of at most 5000 items we take ten percent of all elements to
    check for a uniqueness share; if the sequence is longer than 5000 items,
    we check only the first 500 elements.
    All constants were chosen empirically.
    """
    if check_count is None:
        size = len(arg)
        # in this case, the gain from caching is negligible
        if size <= 50:
            return False
        check_count = int(size * 0.1) if size <= 5000 else 500
    else:
        # Raised explicitly (not via ``assert``) so that the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        if not 0 <= check_count <= len(arg):
            raise AssertionError(
                'check_count must be in next bounds: [0; len(arg)]')
        # nothing to sample, so there is no evidence in favour of caching
        if check_count == 0:
            return False

    if not 0 < unique_share < 1:
        raise AssertionError('unique_share must be in next bounds: (0; 1)')

    unique_elements = unique(arg[:check_count])
    return len(unique_elements) <= check_count * unique_share
123
+
124
+
63
125
def _maybe_cache (arg , format , cache , convert_listlike ):
64
126
"""
65
127
Create a cache of unique dates from an array of dates
66
128
67
129
Parameters
68
130
----------
69
- arg : integer, float, string, datetime, list , tuple, 1-d array, Series
131
+ arg : listlike , tuple, 1-d array, Series
70
132
format : string
71
133
Strftime format to parse time
72
134
cache : boolean
@@ -84,11 +146,12 @@ def _maybe_cache(arg, format, cache, convert_listlike):
84
146
cache_array = Series ()
85
147
if cache :
86
148
# Perform a quicker unique check
87
- from pandas import Index
149
+ if not should_cache (arg ):
150
+ return cache_array
88
151
89
- unique_dates = Index ( arg ). unique ()
152
+ unique_dates = unique (arg )
90
153
if len (unique_dates ) < len (arg ):
91
- cache_dates = convert_listlike (unique_dates . to_numpy () , True , format )
154
+ cache_dates = convert_listlike (unique_dates , True , format )
92
155
cache_array = Series (cache_dates , index = unique_dates )
93
156
return cache_array
94
157
@@ -491,21 +554,11 @@ def _adjust_to_origin(arg, origin, unit):
491
554
return arg
492
555
493
556
494
- @deprecate_kwarg (old_arg_name = "box" , new_arg_name = None )
495
- def to_datetime (
496
- arg ,
497
- errors = "raise" ,
498
- dayfirst = False ,
499
- yearfirst = False ,
500
- utc = None ,
501
- box = True ,
502
- format = None ,
503
- exact = True ,
504
- unit = None ,
505
- infer_datetime_format = False ,
506
- origin = "unix" ,
507
- cache = False ,
508
- ):
557
+ @deprecate_kwarg (old_arg_name = 'box' , new_arg_name = None )
558
+ def to_datetime (arg , errors = 'raise' , dayfirst = False , yearfirst = False ,
559
+ utc = None , box = True , format = None , exact = True ,
560
+ unit = None , infer_datetime_format = False , origin = 'unix' ,
561
+ cache = True ):
509
562
"""
510
563
Convert argument to datetime.
511
564
@@ -586,13 +639,16 @@ def to_datetime(
586
639
origin.
587
640
588
641
.. versionadded:: 0.20.0
589
- cache : boolean, default False
642
+ cache : boolean, default True
590
643
If True, use a cache of unique, converted dates to apply the datetime
591
644
conversion. May produce significant speed-up when parsing duplicate
592
645
date strings, especially ones with timezone offsets.
593
646
594
647
.. versionadded:: 0.23.0
595
648
649
+ .. versionchanged:: 0.25.0
650
+ - changed default value from False to True
651
+
596
652
Returns
597
653
-------
598
654
ret : datetime if parsing succeeded.
0 commit comments