Skip to content

Commit 21b4eee

Browse files
committed
Merge remote-tracking branch 'upstream/master' into jorisvandenbossche-blacken
2 parents 2b7e27e + b3d3ce7 commit 21b4eee

File tree

8 files changed

+142
-34
lines changed

8 files changed

+142
-34
lines changed

asv_bench/benchmarks/io/csv.py

+14-11
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import numpy as np
55
import pandas.util.testing as tm
66
from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime
7-
from pandas.io.parsers import _parser_defaults
87
from io import StringIO
98

109
from ..pandas_vb_common import BaseIO
@@ -272,13 +271,12 @@ def setup(self, do_cache):
272271
self.StringIO_input = StringIO(data)
273272

274273
def time_read_csv_cached(self, do_cache):
275-
# kwds setting here is used to avoid breaking tests in
276-
# previous version of pandas, because this is api changes
277-
kwds = {}
278-
if 'cache_dates' in _parser_defaults:
279-
kwds['cache_dates'] = do_cache
280-
read_csv(self.data(self.StringIO_input), header=None,
281-
parse_dates=[0], **kwds)
274+
try:
275+
read_csv(self.data(self.StringIO_input), header=None,
276+
parse_dates=[0], cache_dates=do_cache)
277+
except TypeError:
278+
# cache_dates is a new keyword in 0.25
279+
pass
282280

283281

284282
class ReadCSVMemoryGrowth(BaseIO):
@@ -329,9 +327,14 @@ def setup(self, cache_dates):
329327
self.StringIO_input = StringIO(data)
330328

331329
def time_read_csv_dayfirst(self, cache_dates):
332-
read_csv(self.data(self.StringIO_input), sep=',', header=None,
333-
names=['Date'], parse_dates=['Date'], cache_dates=cache_dates,
334-
dayfirst=True)
330+
try:
331+
read_csv(self.data(self.StringIO_input), sep=',', header=None,
332+
names=['Date'], parse_dates=['Date'],
333+
cache_dates=cache_dates,
334+
dayfirst=True)
335+
except TypeError:
336+
# cache_dates is a new keyword in 0.25
337+
pass
335338

336339
def time_to_datetime_dayfirst(self, cache_dates):
337340
df = read_csv(self.data(self.StringIO_input),

asv_bench/benchmarks/timeseries.py

+13
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,19 @@ def time_format_YYYYMMDD(self):
300300
to_datetime(self.stringsD, format='%Y%m%d')
301301

302302

303+
class ToDatetimeCacheSmallCount(object):
304+
305+
params = ([True, False], [50, 500, 5000, 100000])
306+
param_names = ['cache', 'count']
307+
308+
def setup(self, cache, count):
309+
rng = date_range(start='1/1/1971', periods=count)
310+
self.unique_date_strings = rng.strftime('%Y-%m-%d').tolist()
311+
312+
def time_unique_date_strings(self, cache, count):
313+
to_datetime(self.unique_date_strings, cache=cache)
314+
315+
303316
class ToDatetimeISO8601:
304317

305318
def setup(self):

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -937,6 +937,7 @@ Performance improvements
937937
- Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`)
938938
- Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
939939
- Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
940+
- For :meth:`to_datetime`, changed the default value of the ``cache`` parameter to ``True`` (:issue:`26043`)
940941
941942
.. _whatsnew_0250.bug_fixes:
942943

pandas/core/indexes/multi.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -995,7 +995,7 @@ def _shallow_copy(self, values=None, **kwargs):
995995
# discards freq
996996
kwargs.pop("freq", None)
997997
return MultiIndex.from_tuples(values, names=names, **kwargs)
998-
return self.view()
998+
return self.copy(**kwargs)
999999

10001000
@cache_readonly
10011001
def dtype(self):
@@ -1923,7 +1923,7 @@ def remove_unused_levels(self):
19231923
new_levels.append(lev)
19241924
new_codes.append(level_codes)
19251925

1926-
result = self._shallow_copy()
1926+
result = self.view()
19271927

19281928
if changed:
19291929
result._reset_identity()

pandas/core/tools/datetimes.py

+76-20
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,14 @@
3939

4040
from pandas._typing import ArrayLike
4141
from pandas.core import algorithms
42+
from pandas.core.algorithms import unique
43+
44+
# ---------------------------------------------------------------------
45+
# types used in annotations
46+
47+
ArrayConvertible = Union[list, tuple, ArrayLike, ABCSeries]
48+
49+
# ---------------------------------------------------------------------
4250

4351
# ---------------------------------------------------------------------
4452
# types used in annotations
@@ -60,13 +68,67 @@ def _guess_datetime_format_for_array(arr, **kwargs):
6068
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
6169

6270

71+
def should_cache(arg: ArrayConvertible, unique_share: float = 0.7,
72+
check_count: Optional[int] = None) -> bool:
73+
"""
74+
Decides whether to do caching.
75+
76+
If the percent of unique elements among `check_count` elements is less
77+
than `unique_share * 100` then we can do caching.
78+
79+
Parameters
80+
----------
81+
arg: listlike, tuple, 1-d array, Series
82+
unique_share: float, default=0.7, optional
83+
0 < unique_share < 1
84+
check_count: int, optional
85+
0 <= check_count <= len(arg)
86+
87+
Returns
88+
-------
89+
do_caching: bool
90+
91+
Notes
92+
-----
93+
By default for a sequence of less than 50 items in size, we don't do
94+
caching; for the number of elements less than 5000, we take ten percent of
95+
all elements to check for a uniqueness share; if the sequence size is more
96+
than 5000, then we check only the first 500 elements.
97+
All constants were chosen empirically.
98+
"""
99+
do_caching = True
100+
101+
# default realization
102+
if check_count is None:
103+
# in this case, the gain from caching is negligible
104+
if len(arg) <= 50:
105+
return False
106+
107+
if len(arg) <= 5000:
108+
check_count = int(len(arg) * 0.1)
109+
else:
110+
check_count = 500
111+
else:
112+
assert 0 <= check_count <= len(arg), \
113+
'check_count must be in next bounds: [0; len(arg)]'
114+
if check_count == 0:
115+
return False
116+
117+
assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)'
118+
119+
unique_elements = unique(arg[:check_count])
120+
if len(unique_elements) > check_count * unique_share:
121+
do_caching = False
122+
return do_caching
123+
124+
63125
def _maybe_cache(arg, format, cache, convert_listlike):
64126
"""
65127
Create a cache of unique dates from an array of dates
66128
67129
Parameters
68130
----------
69-
arg : integer, float, string, datetime, list, tuple, 1-d array, Series
131+
arg : listlike, tuple, 1-d array, Series
70132
format : string
71133
Strftime format to parse time
72134
cache : boolean
@@ -84,11 +146,12 @@ def _maybe_cache(arg, format, cache, convert_listlike):
84146
cache_array = Series()
85147
if cache:
86148
# Perform a quicker unique check
87-
from pandas import Index
149+
if not should_cache(arg):
150+
return cache_array
88151

89-
unique_dates = Index(arg).unique()
152+
unique_dates = unique(arg)
90153
if len(unique_dates) < len(arg):
91-
cache_dates = convert_listlike(unique_dates.to_numpy(), True, format)
154+
cache_dates = convert_listlike(unique_dates, True, format)
92155
cache_array = Series(cache_dates, index=unique_dates)
93156
return cache_array
94157

@@ -491,21 +554,11 @@ def _adjust_to_origin(arg, origin, unit):
491554
return arg
492555

493556

494-
@deprecate_kwarg(old_arg_name="box", new_arg_name=None)
495-
def to_datetime(
496-
arg,
497-
errors="raise",
498-
dayfirst=False,
499-
yearfirst=False,
500-
utc=None,
501-
box=True,
502-
format=None,
503-
exact=True,
504-
unit=None,
505-
infer_datetime_format=False,
506-
origin="unix",
507-
cache=False,
508-
):
557+
@deprecate_kwarg(old_arg_name='box', new_arg_name=None)
558+
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
559+
utc=None, box=True, format=None, exact=True,
560+
unit=None, infer_datetime_format=False, origin='unix',
561+
cache=True):
509562
"""
510563
Convert argument to datetime.
511564
@@ -586,13 +639,16 @@ def to_datetime(
586639
origin.
587640
588641
.. versionadded:: 0.20.0
589-
cache : boolean, default False
642+
cache : boolean, default True
590643
If True, use a cache of unique, converted dates to apply the datetime
591644
conversion. May produce significant speed-up when parsing duplicate
592645
date strings, especially ones with timezone offsets.
593646
594647
.. versionadded:: 0.23.0
595648
649+
.. versionchanged:: 0.25.0
650+
- changed default value from False to True
651+
596652
Returns
597653
-------
598654
ret : datetime if parsing succeeded.

pandas/tests/indexes/datetimes/test_tools.py

+20
Original file line numberDiff line numberDiff line change
@@ -2241,3 +2241,23 @@ def test_arg_tz_ns_unit(self, offset, utc, exp):
22412241
result = to_datetime([arg], unit="ns", utc=utc)
22422242
expected = to_datetime([exp])
22432243
tm.assert_index_equal(result, expected)
2244+
2245+
2246+
@pytest.mark.parametrize('listlike,do_caching', [
2247+
([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False),
2248+
([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True)
2249+
])
2250+
def test_should_cache(listlike, do_caching):
2251+
assert tools.should_cache(listlike, check_count=len(listlike),
2252+
unique_share=0.7) == do_caching
2253+
2254+
2255+
@pytest.mark.parametrize('unique_share,check_count, err_message', [
2256+
(0.5, 11, r'check_count must be in next bounds: \[0; len\(arg\)\]'),
2257+
(10, 2, r'unique_share must be in next bounds: \(0; 1\)')
2258+
])
2259+
def test_should_cache_errors(unique_share, check_count, err_message):
2260+
arg = [5] * 10
2261+
2262+
with pytest.raises(AssertionError, match=err_message):
2263+
tools.should_cache(arg, unique_share, check_count)

pandas/tests/indexes/multi/test_equivalence.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ def test_is_():
177177
assert mi2.is_(mi)
178178
assert mi.is_(mi2)
179179

180-
assert mi.is_(mi.set_names(["C", "D"]))
180+
assert not mi.is_(mi.set_names(["C", "D"]))
181181
mi2 = mi.view()
182182
mi2.set_names(["E", "F"], inplace=True)
183183
assert mi.is_(mi2)

pandas/tests/io/parser/test_parse_dates.py

+15
Original file line numberDiff line numberDiff line change
@@ -1095,6 +1095,21 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
10951095
parser.read_csv(StringIO(data), parse_dates=(1,))
10961096

10971097

1098+
@pytest.mark.parametrize("cache_dates", [True, False])
1099+
@pytest.mark.parametrize("value", [
1100+
'nan', '0', ''])
1101+
def test_bad_date_parse(all_parsers, cache_dates, value):
1102+
# if we have an invalid date make sure that we handle this with
1103+
# and w/o the cache properly
1104+
parser = all_parsers
1105+
s = StringIO(('%s,\n' % value) * 50000)
1106+
1107+
parser.read_csv(s,
1108+
header=None, names=['foo', 'bar'], parse_dates=['foo'],
1109+
infer_datetime_format=False,
1110+
cache_dates=cache_dates)
1111+
1112+
10981113
def test_parse_dates_empty_string(all_parsers):
10991114
# see gh-2263
11001115
parser = all_parsers

0 commit comments

Comments
 (0)