PERF: changed default value of cache parameter to True in to_datetime function #26043

Merged: 22 commits, Jul 4, 2019

25 changes: 14 additions & 11 deletions asv_bench/benchmarks/io/csv.py
@@ -4,7 +4,6 @@
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime
-from pandas.io.parsers import _parser_defaults
from io import StringIO

from ..pandas_vb_common import BaseIO
@@ -272,13 +271,12 @@ def setup(self, do_cache):
self.StringIO_input = StringIO(data)

    def time_read_csv_cached(self, do_cache):
-        # kwds setting here is used to avoid breaking tests in
-        # previous version of pandas, because this is api changes
-        kwds = {}
-        if 'cache_dates' in _parser_defaults:
-            kwds['cache_dates'] = do_cache
-        read_csv(self.data(self.StringIO_input), header=None,
-                 parse_dates=[0], **kwds)
+        try:
+            read_csv(self.data(self.StringIO_input), header=None,
+                     parse_dates=[0], cache_dates=do_cache)
+        except TypeError:
+            # cache_dates is a new keyword in 0.25
+            pass

Review thread on the new try/except:

Contributor: @TomAugspurger ok method of handling?
Contributor: Seems fine.
Contributor: Although... I worry it would incorrectly catch a TypeError in the function? The other way might be to check pandas.__version__?
Contributor: hmm, let me see what i can do
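
A sketch of the version-gated alternative floated in the thread above, for comparison only; the merged benchmark kept the try/except, and the use of LooseVersion here is an assumption:

    # Hypothetical variant of the benchmark method: gate the keyword on the
    # installed pandas version instead of catching TypeError, so an unrelated
    # TypeError raised inside read_csv is not silently swallowed.
    from distutils.version import LooseVersion

    import pandas as pd
    from pandas import read_csv

    def time_read_csv_cached(self, do_cache):
        kwds = {}
        if LooseVersion(pd.__version__) >= LooseVersion('0.25.0'):
            # cache_dates only exists from 0.25 onwards
            kwds['cache_dates'] = do_cache
        read_csv(self.data(self.StringIO_input), header=None,
                 parse_dates=[0], **kwds)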


class ReadCSVMemoryGrowth(BaseIO):
@@ -329,9 +327,14 @@ def setup(self, cache_dates):
self.StringIO_input = StringIO(data)

    def time_read_csv_dayfirst(self, cache_dates):
-        read_csv(self.data(self.StringIO_input), sep=',', header=None,
-                 names=['Date'], parse_dates=['Date'], cache_dates=cache_dates,
-                 dayfirst=True)
+        try:
+            read_csv(self.data(self.StringIO_input), sep=',', header=None,
+                     names=['Date'], parse_dates=['Date'],
+                     cache_dates=cache_dates,
+                     dayfirst=True)
+        except TypeError:
+            # cache_dates is a new keyword in 0.25
+            pass

def time_to_datetime_dayfirst(self, cache_dates):
df = read_csv(self.data(self.StringIO_input),
13 changes: 13 additions & 0 deletions asv_bench/benchmarks/timeseries.py
@@ -300,6 +300,19 @@ def time_format_YYYYMMDD(self):
to_datetime(self.stringsD, format='%Y%m%d')


class ToDatetimeCacheSmallCount(object):

params = ([True, False], [50, 500, 5000, 100000])
param_names = ['cache', 'count']

def setup(self, cache, count):
rng = date_range(start='1/1/1971', periods=count)
self.unique_date_strings = rng.strftime('%Y-%m-%d').tolist()

def time_unique_date_strings(self, cache, count):
to_datetime(self.unique_date_strings, cache=cache)
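
The small counts are the point of this benchmark: for short, all-unique inputs the cache is pure overhead, which is what motivates the should_cache heuristic added later in this diff. A rough, illustrative timing sketch (not part of the PR; numbers vary by machine):

    # Compare cache on/off for a short list of unique date strings.
    import timeit

    setup = ("import pandas as pd; "
             "strings = pd.date_range('1/1/1971', periods=50)"
             ".strftime('%Y-%m-%d').tolist()")
    print(timeit.timeit("pd.to_datetime(strings, cache=True)",
                        setup=setup, number=200))
    print(timeit.timeit("pd.to_datetime(strings, cache=False)",
                        setup=setup, number=200))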


class ToDatetimeISO8601:

def setup(self):
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
@@ -936,6 +936,7 @@ Performance improvements
- Restored performance of :meth:`DatetimeIndex.__iter__` by re-enabling specialized code path (:issue:`26702`)
- Improved performance when building :class:`MultiIndex` with at least one :class:`CategoricalIndex` level (:issue:`22044`)
- Improved performance by removing the need for a garbage collect when checking for ``SettingWithCopyWarning`` (:issue:`27031`)
- Improved performance of :meth:`to_datetime` by changing the default value of the ``cache`` parameter to ``True`` (:issue:`26043`)

.. _whatsnew_0250.bug_fixes:

80 changes: 73 additions & 7 deletions pandas/core/tools/datetimes.py
@@ -22,6 +22,14 @@

from pandas._typing import ArrayLike
from pandas.core import algorithms
+from pandas.core.algorithms import unique
+
+# ---------------------------------------------------------------------
+# types used in annotations
+
+ArrayConvertible = Union[list, tuple, ArrayLike, ABCSeries]
+
+# ---------------------------------------------------------------------

# ---------------------------------------------------------------------
# types used in annotations
@@ -42,13 +50,67 @@ def _guess_datetime_format_for_array(arr, **kwargs):
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)


def should_cache(arg: ArrayConvertible, unique_share: float = 0.7,
check_count: Optional[int] = None) -> bool:
"""
Decides whether to do caching.

If the percentage of unique elements among the first `check_count` elements
is less than `unique_share * 100`, then caching the conversion is worthwhile.

Parameters
----------
arg: listlike, tuple, 1-d array, Series
unique_share: float, default=0.7, optional
0 < unique_share < 1
check_count: int, optional
0 <= check_count <= len(arg)

Returns
-------
do_caching: bool

Notes
-----
By default, for a sequence of 50 items or fewer we do not cache; for
sequences of up to 5000 items we check ten percent of all elements for
their uniqueness share; for longer sequences we check only the first 500
elements. All constants were chosen empirically.
"""
do_caching = True

# default implementation
if check_count is None:
# in this case, the gain from caching is negligible
if len(arg) <= 50:
return False

if len(arg) <= 5000:
check_count = int(len(arg) * 0.1)
else:
check_count = 500
else:
assert 0 <= check_count <= len(arg), \
'check_count must be in next bounds: [0; len(arg)]'
if check_count == 0:
return False

assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)'

unique_elements = unique(arg[:check_count])
if len(unique_elements) > check_count * unique_share:
do_caching = False
return do_caching
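
A minimal sketch of how the heuristic behaves, assuming this branch is installed; the expected results follow directly from the constants documented above:

    from pandas.core.tools.datetimes import should_cache

    should_cache(list(range(40)))                # False: at or under the 50-item floor
    should_cache(['2000-01-01'] * 1000)          # True: 10% sample is all duplicates
    should_cache([str(i) for i in range(1000)])  # False: 10% sample is fully unique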


def _maybe_cache(arg, format, cache, convert_listlike):
"""
Create a cache of unique dates from an array of dates

Parameters
----------
-arg : integer, float, string, datetime, list, tuple, 1-d array, Series
+arg : listlike, tuple, 1-d array, Series
format : string
Strftime format to parse time
cache : boolean
@@ -65,11 +127,12 @@ def _maybe_cache(arg, format, cache, convert_listlike):
cache_array = Series()
if cache:
# Perform a quicker unique check
-from pandas import Index
-unique_dates = Index(arg).unique()
+if not should_cache(arg):
+    return cache_array
+
+unique_dates = unique(arg)
if len(unique_dates) < len(arg):
-    cache_dates = convert_listlike(unique_dates.to_numpy(),
-                                   True, format)
+    cache_dates = convert_listlike(unique_dates, True, format)
    cache_array = Series(cache_dates, index=unique_dates)
return cache_array
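
In effect, _maybe_cache turns repeated parsing into a lookup: the returned Series maps each unique raw value to its parsed Timestamp, and to_datetime later resolves the whole input through it. A toy illustration of the idea (hand-built cache, not the internal code path):

    from pandas import Series, to_datetime

    raw = ['2019-07-04', '2019-07-04', '2019-01-01', '2019-07-04']
    uniques = ['2019-07-04', '2019-01-01']
    cache_array = Series(to_datetime(uniques, cache=False), index=uniques)
    # Each element of `raw` resolves through the cache instead of being re-parsed.
    parsed = cache_array[raw]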

@@ -448,7 +511,7 @@ def _adjust_to_origin(arg, origin, unit):
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
utc=None, box=True, format=None, exact=True,
unit=None, infer_datetime_format=False, origin='unix',
-                cache=False):
+                cache=True):
"""
Convert argument to datetime.

@@ -529,13 +592,16 @@
origin.

.. versionadded:: 0.20.0
-cache : boolean, default False
+cache : boolean, default True
If True, use a cache of unique, converted dates to apply the datetime
conversion. May produce significant speed-up when parsing duplicate
date strings, especially ones with timezone offsets.

.. versionadded:: 0.23.0

+.. versionchanged:: 0.25.0
+   - changed default value from False to True

Returns
-------
ret : datetime if parsing succeeded.
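
A short usage sketch of the changed default; the results are identical either way, only the parsing strategy differs:

    import pandas as pd

    dates = ['2019-07-04 00:00:00+05:00'] * 10000

    # 0.25: the cache is on by default, so the offset string is parsed once.
    pd.to_datetime(dates)

    # Opt out to recover the pre-0.25 behaviour, e.g. for short unique inputs.
    pd.to_datetime(dates, cache=False)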
20 changes: 20 additions & 0 deletions pandas/tests/indexes/datetimes/test_tools.py
@@ -2032,3 +2032,23 @@ def test_arg_tz_ns_unit(self, offset, utc, exp):
result = to_datetime([arg], unit='ns', utc=utc)
expected = to_datetime([exp])
tm.assert_index_equal(result, expected)


@pytest.mark.parametrize('listlike,do_caching', [
([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False),
([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True)
])
def test_should_cache(listlike, do_caching):
assert tools.should_cache(listlike, check_count=len(listlike),
unique_share=0.7) == do_caching


@pytest.mark.parametrize('unique_share,check_count, err_message', [
(0.5, 11, r'check_count must be in next bounds: \[0; len\(arg\)\]'),
(10, 2, r'unique_share must be in next bounds: \(0; 1\)')
])
def test_should_cache_errors(unique_share, check_count, err_message):
arg = [5] * 10

with pytest.raises(AssertionError, match=err_message):
tools.should_cache(arg, unique_share, check_count)
15 changes: 15 additions & 0 deletions pandas/tests/io/parser/test_parse_dates.py
@@ -635,6 +635,21 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
parser.read_csv(StringIO(data), parse_dates=(1,))


@pytest.mark.parametrize("cache_dates", [True, False])
@pytest.mark.parametrize("value", [
'nan', '0', ''])
def test_bad_date_parse(all_parsers, cache_dates, value):
# if we have an invalid date make sure that we handle it both with
# and without the cache
parser = all_parsers
s = StringIO(('%s,\n' % value) * 50000)

parser.read_csv(s,
header=None, names=['foo', 'bar'], parse_dates=['foo'],
infer_datetime_format=False,
cache_dates=cache_dates)


def test_parse_dates_empty_string(all_parsers):
# see gh-2263
parser = all_parsers