diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 779fc0bd20964..9614a63332609 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -346,17 +346,22 @@ class ToDatetime(object): def setup(self): self.rng = date_range(start='1/1/2000', periods=10000, freq='D') - self.stringsD = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str) + self.stringsD = Series(self.rng.strftime('%Y%m%d')) self.rng = date_range(start='1/1/2000', periods=20000, freq='H') - self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng] - self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng] + self.strings = self.rng.strftime('%Y-%m-%d %H:%M:%S').tolist() + self.strings_nosep = self.rng.strftime('%Y%m%d %H:%M:%S').tolist() self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800' for x in self.rng] self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) self.s2 = self.s.str.replace(':\\S+$', '') + self.unique_numeric_seconds = range(10000) + self.dup_numeric_seconds = [1000] * 10000 + self.dup_string_dates = ['2000-02-11'] * 10000 + self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * 10000 + def time_format_YYYYMMDD(self): to_datetime(self.stringsD, format='%Y%m%d') @@ -381,6 +386,36 @@ def time_format_exact(self): def time_format_no_exact(self): to_datetime(self.s, format='%d%b%y', exact=False) + def time_cache_true_with_unique_seconds_and_unit(self): + to_datetime(self.unique_numeric_seconds, unit='s', cache=True) + + def time_cache_false_with_unique_seconds_and_unit(self): + to_datetime(self.unique_numeric_seconds, unit='s', cache=False) + + def time_cache_true_with_dup_seconds_and_unit(self): + to_datetime(self.dup_numeric_seconds, unit='s', cache=True) + + def time_cache_false_with_dup_seconds_and_unit(self): + to_datetime(self.dup_numeric_seconds, unit='s', cache=False) + + def time_cache_true_with_dup_string_dates(self): + to_datetime(self.dup_string_dates, cache=True) + + def time_cache_false_with_dup_string_dates(self): + to_datetime(self.dup_string_dates, cache=False) + + def time_cache_true_with_dup_string_dates_and_format(self): + to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=True) + + def time_cache_false_with_dup_string_dates_and_format(self): + to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=False) + + def time_cache_true_with_dup_string_tzoffset_dates(self): + to_datetime(self.dup_string_with_tz, cache=True) + + def time_cache_false_with_dup_string_tzoffset_dates(self): + to_datetime(self.dup_string_with_tz, cache=False) + class Offsets(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 61679b14a8592..712119caae6f2 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -70,7 +70,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`) -- +- Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`) - .. 
_whatsnew_0220.docs: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ae8aa275b2bae..19f7e459d0725 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -36,9 +36,77 @@ def _guess_datetime_format_for_array(arr, **kwargs): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) +def _maybe_cache(arg, format, cache, tz, convert_listlike): + """ + Create a cache of unique dates from an array of dates + + Parameters + ---------- + arg : integer, float, string, datetime, list, tuple, 1-d array, Series + format : string + Strftime format to parse time + cache : boolean + True attempts to create a cache of converted values + tz : string + Timezone of the dates + convert_listlike : function + Conversion function to apply on dates + + Returns + ------- + cache_array : Series + Cache of converted, unique dates. Can be empty + """ + from pandas import Series + cache_array = Series() + if cache: + # Perform a quicker unique check + from pandas import Index + if not Index(arg).is_unique: + unique_dates = algorithms.unique(arg) + cache_dates = convert_listlike(unique_dates, True, format, tz=tz) + cache_array = Series(cache_dates, index=unique_dates) + return cache_array + + +def _convert_and_box_cache(arg, cache_array, box, errors, name=None): + """ + Convert array of dates with a cache and box the result + + Parameters + ---------- + arg : integer, float, string, datetime, list, tuple, 1-d array, Series + cache_array : Series + Cache of converted, unique dates + box : boolean + True boxes result as an Index-like, False returns an ndarray + errors : string + 'ignore' plus box=True will convert result to Index + name : string, default None + Name for a DatetimeIndex + + Returns + ------- + result : datetime of converted dates + Returns: + + - Index-like if box=True + - ndarray if box=False + """ + from pandas import Series, DatetimeIndex, Index + result = Series(arg).map(cache_array) + if box: + if errors == 'ignore': + return Index(result) + else: + return DatetimeIndex(result, name=name) + return result.values + + def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, - unit=None, infer_datetime_format=False, origin='unix'): + unit=None, infer_datetime_format=False, origin='unix', + cache=False): """ Convert argument to datetime. @@ -111,7 +179,12 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, origin. .. versionadded: 0.20.0 + cache : boolean, default False + If True, use a cache of unique, converted dates to apply the datetime + conversion. May produce significant speed-up when parsing duplicate date + strings, especially ones with timezone offsets. + .. versionadded: 0.22.0 Returns ------- ret : datetime if parsing succeeded.
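Note on the two helpers added above: they implement a simple memoisation scheme, where with cache=True and a non-unique input the unique values are converted once and every element is then looked up in that result. Below is a minimal, self-contained sketch of the same idea using only the public pandas API; the cached_to_datetime helper is illustrative and is not the code path added by this PR.

import pandas as pd

def cached_to_datetime(values, **kwargs):
    # Convert each unique raw value exactly once.
    unique_vals = pd.Series(values).unique()
    mapping = pd.Series(pd.to_datetime(unique_vals, **kwargs), index=unique_vals)
    # Map every element of the original input through the converted values,
    # so duplicates are never parsed a second time.
    return pd.DatetimeIndex(pd.Series(values).map(mapping))

dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * 10000
result = cached_to_datetime(dup_string_with_tz, utc=True)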
@@ -369,15 +442,28 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): if isinstance(arg, tslib.Timestamp): result = arg elif isinstance(arg, ABCSeries): - from pandas import Series - values = _convert_listlike(arg._values, True, format) - result = Series(values, index=arg.index, name=arg.name) + cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) + if not cache_array.empty: + result = arg.map(cache_array) + else: + from pandas import Series + values = _convert_listlike(arg._values, True, format) + result = Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): - result = _convert_listlike(arg, box, format, name=arg.name) + cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) + if not cache_array.empty: + result = _convert_and_box_cache(arg, cache_array, box, errors, + name=arg.name) + else: + result = _convert_listlike(arg, box, format, name=arg.name) elif is_list_like(arg): - result = _convert_listlike(arg, box, format) + cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) + if not cache_array.empty: + result = _convert_and_box_cache(arg, cache_array, box, errors) + else: + result = _convert_listlike(arg, box, format) else: result = _convert_listlike(np.array([arg]), box, format)[0] diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 8205b4fde217b..307184cb34e27 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -28,7 +28,8 @@ class TestTimeConversionFormats(object): - def test_to_datetime_format(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format(self, cache): values = ['1/1/2000', '1/2/2000', '1/3/2000'] results1 = [Timestamp('20000101'), Timestamp('20000201'), @@ -43,7 +44,7 @@ def test_to_datetime_format(self): (values[2], (results1[2], results2[2]))]: for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']): - result = to_datetime(vals, format=fmt) + result = to_datetime(vals, format=fmt, cache=cache) expected = expecteds[i] if isinstance(expected, Series): @@ -53,14 +54,15 @@ def test_to_datetime_format(self): else: tm.assert_index_equal(result, expected) - def test_to_datetime_format_YYYYMMDD(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format_YYYYMMDD(self, cache): s = Series([19801222, 19801222] + [19810105] * 5) expected = Series([Timestamp(x) for x in s.apply(str)]) - result = to_datetime(s, format='%Y%m%d') + result = to_datetime(s, format='%Y%m%d', cache=cache) assert_series_equal(result, expected) - result = to_datetime(s.apply(str), format='%Y%m%d') + result = to_datetime(s.apply(str), format='%Y%m%d', cache=cache) assert_series_equal(result, expected) # with NaT @@ -69,44 +71,48 @@ def test_to_datetime_format_YYYYMMDD(self): expected[2] = np.nan s[2] = np.nan - result = to_datetime(s, format='%Y%m%d') + result = to_datetime(s, format='%Y%m%d', cache=cache) assert_series_equal(result, expected) # string with NaT s = s.apply(str) s[2] = 'nat' - result = to_datetime(s, format='%Y%m%d') + result = to_datetime(s, format='%Y%m%d', cache=cache) assert_series_equal(result, expected) # coercion # GH 7930 s = Series([20121231, 20141231, 99991231]) - result = pd.to_datetime(s, format='%Y%m%d', errors='ignore') + result = pd.to_datetime(s, format='%Y%m%d', errors='ignore', + cache=cache) expected = 
Series([datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)], dtype=object) tm.assert_series_equal(result, expected) - result = pd.to_datetime(s, format='%Y%m%d', errors='coerce') + result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', + cache=cache) expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') assert_series_equal(result, expected) - # GH 10178 - def test_to_datetime_format_integer(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format_integer(self, cache): + # GH 10178 s = Series([2000, 2001, 2002]) expected = Series([Timestamp(x) for x in s.apply(str)]) - result = to_datetime(s, format='%Y') + result = to_datetime(s, format='%Y', cache=cache) assert_series_equal(result, expected) s = Series([200001, 200105, 200206]) expected = Series([Timestamp(x[:4] + '-' + x[4:]) for x in s.apply(str) ]) - result = to_datetime(s, format='%Y%m') + result = to_datetime(s, format='%Y%m', cache=cache) assert_series_equal(result, expected) - def test_to_datetime_format_microsecond(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format_microsecond(self, cache): # these are locale dependent lang, _ = locale.getlocale() @@ -114,11 +120,12 @@ def test_to_datetime_format_microsecond(self): val = '01-{}-2011 00:00:01.978'.format(month_abbr) format = '%d-%b-%Y %H:%M:%S.%f' - result = to_datetime(val, format=format) + result = to_datetime(val, format=format, cache=cache) exp = datetime.strptime(val, format) assert result == exp - def test_to_datetime_format_time(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format_time(self, cache): data = [ ['01/10/2010 15:20', '%m/%d/%Y %H:%M', Timestamp('2010-01-10 15:20')], @@ -134,9 +141,10 @@ def test_to_datetime_format_time(self): # Timestamp('2010-01-10 09:12:56')] ] for s, format, dt in data: - assert to_datetime(s, format=format) == dt + assert to_datetime(s, format=format, cache=cache) == dt - def test_to_datetime_with_non_exact(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_with_non_exact(self, cache): # GH 10834 tm._skip_if_has_locale() @@ -147,12 +155,13 @@ def test_to_datetime_with_non_exact(self): s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00', '19MAY11 00:00:00Z']) - result = to_datetime(s, format='%d%b%y', exact=False) + result = to_datetime(s, format='%d%b%y', exact=False, cache=cache) expected = to_datetime(s.str.extract(r'(\d+\w+\d+)', expand=False), - format='%d%b%y') + format='%d%b%y', cache=cache) assert_series_equal(result, expected) - def test_parse_nanoseconds_with_formula(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_parse_nanoseconds_with_formula(self, cache): # GH8989 # trunctaing the nanoseconds when a format was provided @@ -161,44 +170,48 @@ def test_parse_nanoseconds_with_formula(self): "2012-01-01 09:00:00.001", "2012-01-01 09:00:00.001000", "2012-01-01 09:00:00.001000000", ]: - expected = pd.to_datetime(v) - result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f") + expected = pd.to_datetime(v, cache=cache) + result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f", + cache=cache) assert result == expected - def test_to_datetime_format_weeks(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format_weeks(self, cache): data = [ ['2009324', '%Y%W%w', Timestamp('2009-08-13')], ['2013020', '%Y%U%w', Timestamp('2013-01-13')] ] for s, format, dt in data: - assert to_datetime(s, format=format) == dt + assert 
to_datetime(s, format=format, cache=cache) == dt class TestToDatetime(object): - def test_to_datetime_dt64s(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_dt64s(self, cache): in_bound_dts = [ np.datetime64('2000-01-01'), np.datetime64('2000-01-02'), ] for dt in in_bound_dts: - assert pd.to_datetime(dt) == Timestamp(dt) + assert pd.to_datetime(dt, cache=cache) == Timestamp(dt) oob_dts = [np.datetime64('1000-01-01'), np.datetime64('5000-01-02'), ] for dt in oob_dts: pytest.raises(ValueError, pd.to_datetime, dt, errors='raise') pytest.raises(ValueError, Timestamp, dt) - assert pd.to_datetime(dt, errors='coerce') is NaT + assert pd.to_datetime(dt, errors='coerce', cache=cache) is NaT - def test_to_datetime_array_of_dt64s(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_array_of_dt64s(self, cache): dts = [np.datetime64('2000-01-01'), np.datetime64('2000-01-02'), ] # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing tm.assert_numpy_array_equal( - pd.to_datetime(dts, box=False), + pd.to_datetime(dts, box=False, cache=cache), np.array([Timestamp(x).asm8 for x in dts]) ) @@ -209,7 +222,8 @@ def test_to_datetime_array_of_dt64s(self): errors='raise') tm.assert_numpy_array_equal( - pd.to_datetime(dts_with_oob, box=False, errors='coerce'), + pd.to_datetime(dts_with_oob, box=False, errors='coerce', + cache=cache), np.array( [ Timestamp(dts_with_oob[0]).asm8, @@ -224,20 +238,22 @@ def test_to_datetime_array_of_dt64s(self): # are converted to their .item(), which depending on the version of # numpy is either a python datetime.datetime or datetime.date tm.assert_numpy_array_equal( - pd.to_datetime(dts_with_oob, box=False, errors='ignore'), + pd.to_datetime(dts_with_oob, box=False, errors='ignore', + cache=cache), np.array( [dt.item() for dt in dts_with_oob], dtype='O' ) ) - def test_to_datetime_tz(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_tz(self, cache): # xref 8260 # uniform returns a DatetimeIndex arr = [pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')] - result = pd.to_datetime(arr) + result = pd.to_datetime(arr, cache=cache) expected = DatetimeIndex( ['2013-01-01 13:00:00', '2013-01-02 14:00:00'], tz='US/Pacific') tm.assert_index_equal(result, expected) @@ -245,9 +261,10 @@ def test_to_datetime_tz(self): # mixed tzs will raise arr = [pd.Timestamp('2013-01-01 13:00:00', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00', tz='US/Eastern')] - pytest.raises(ValueError, lambda: pd.to_datetime(arr)) + pytest.raises(ValueError, lambda: pd.to_datetime(arr, cache=cache)) - def test_to_datetime_tz_pytz(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_tz_pytz(self, cache): # see gh-8260 us_eastern = pytz.timezone('US/Eastern') arr = np.array([us_eastern.localize(datetime(year=2000, month=1, day=1, @@ -255,18 +272,20 @@ def test_to_datetime_tz_pytz(self): us_eastern.localize(datetime(year=2000, month=6, day=1, hour=3, minute=0))], dtype=object) - result = pd.to_datetime(arr, utc=True) + result = pd.to_datetime(arr, utc=True, cache=cache) expected = DatetimeIndex(['2000-01-01 08:00:00+00:00', '2000-06-01 07:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('cache', [True, False]) @pytest.mark.parametrize("init_constructor, end_constructor, test_method", [(Index, DatetimeIndex, 
tm.assert_index_equal), (list, DatetimeIndex, tm.assert_index_equal), (np.array, DatetimeIndex, tm.assert_index_equal), (Series, Series, tm.assert_series_equal)]) def test_to_datetime_utc_true(self, + cache, init_constructor, end_constructor, test_method): @@ -277,39 +296,47 @@ def test_to_datetime_utc_true(self, result = pd.to_datetime(init_constructor(data), format='%Y%m%d %H%M%S', - utc=True) + utc=True, + cache=cache) expected = end_constructor(expected_data) test_method(result, expected) # Test scalar case as well for scalar, expected in zip(data, expected_data): - result = pd.to_datetime(scalar, format='%Y%m%d %H%M%S', utc=True) + result = pd.to_datetime(scalar, format='%Y%m%d %H%M%S', utc=True, + cache=cache) assert result == expected - def test_to_datetime_utc_true_with_series_single_value(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_utc_true_with_series_single_value(self, cache): # GH 15760 UTC=True with Series ts = 1.5e18 - result = pd.to_datetime(pd.Series([ts]), utc=True) + result = pd.to_datetime(pd.Series([ts]), utc=True, cache=cache) expected = pd.Series([pd.Timestamp(ts, tz='utc')]) tm.assert_series_equal(result, expected) - def test_to_datetime_utc_true_with_series_tzaware_string(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_utc_true_with_series_tzaware_string(self, cache): ts = '2013-01-01 00:00:00-01:00' expected_ts = '2013-01-01 01:00:00' data = pd.Series([ts] * 3) - result = pd.to_datetime(data, utc=True) + result = pd.to_datetime(data, utc=True, cache=cache) expected = pd.Series([pd.Timestamp(expected_ts, tz='utc')] * 3) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('cache', [True, False]) @pytest.mark.parametrize('date, dtype', [('2013-01-01 01:00:00', 'datetime64[ns]'), ('2013-01-01 01:00:00', 'datetime64[ns, UTC]')]) - def test_to_datetime_utc_true_with_series_datetime_ns(self, date, dtype): + def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, + dtype): expected = pd.Series([pd.Timestamp('2013-01-01 01:00:00', tz='UTC')]) - result = pd.to_datetime(pd.Series([date], dtype=dtype), utc=True) + result = pd.to_datetime(pd.Series([date], dtype=dtype), utc=True, + cache=cache) tm.assert_series_equal(result, expected) - def test_to_datetime_tz_psycopg2(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_tz_psycopg2(self, cache): # xref 8260 try: @@ -324,7 +351,7 @@ def test_to_datetime_tz_psycopg2(self): datetime(2000, 6, 1, 3, 0, tzinfo=tz2)], dtype=object) - result = pd.to_datetime(arr, errors='coerce', utc=True) + result = pd.to_datetime(arr, errors='coerce', utc=True, cache=cache) expected = DatetimeIndex(['2000-01-01 08:00:00+00:00', '2000-06-01 07:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) @@ -337,32 +364,39 @@ def test_to_datetime_tz_psycopg2(self): assert is_datetime64_ns_dtype(i) # tz coerceion - result = pd.to_datetime(i, errors='coerce') + result = pd.to_datetime(i, errors='coerce', cache=cache) tm.assert_index_equal(result, i) - result = pd.to_datetime(i, errors='coerce', utc=True) + result = pd.to_datetime(i, errors='coerce', utc=True, cache=cache) expected = pd.DatetimeIndex(['2000-01-01 13:00:00'], dtype='datetime64[ns, UTC]') tm.assert_index_equal(result, expected) - def test_datetime_bool(self): + @pytest.mark.parametrize( + 'cache', + [pytest.param(True, + marks=pytest.mark.skipif(True, reason="GH 18111")), + False]) + def test_datetime_bool(self, cache): # GH13176 with pytest.raises(TypeError): 
to_datetime(False) - assert to_datetime(False, errors="coerce") is NaT - assert to_datetime(False, errors="ignore") is False + assert to_datetime(False, errors="coerce", cache=cache) is NaT + assert to_datetime(False, errors="ignore", cache=cache) is False with pytest.raises(TypeError): to_datetime(True) - assert to_datetime(True, errors="coerce") is NaT - assert to_datetime(True, errors="ignore") is True + assert to_datetime(True, errors="coerce", cache=cache) is NaT + assert to_datetime(True, errors="ignore", cache=cache) is True with pytest.raises(TypeError): - to_datetime([False, datetime.today()]) + to_datetime([False, datetime.today()], cache=cache) with pytest.raises(TypeError): - to_datetime(['20130101', True]) + to_datetime(['20130101', True], cache=cache) tm.assert_index_equal(to_datetime([0, False, NaT, 0.0], - errors="coerce"), - DatetimeIndex([to_datetime(0), NaT, - NaT, to_datetime(0)])) + errors="coerce", cache=cache), + DatetimeIndex([to_datetime(0, cache=cache), + NaT, + NaT, + to_datetime(0, cache=cache)])) def test_datetime_invalid_datatype(self): # GH13176 @@ -372,6 +406,39 @@ def test_datetime_invalid_datatype(self): with pytest.raises(TypeError): pd.to_datetime(pd.to_datetime) + @pytest.mark.parametrize("utc", [True, None]) + @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None]) + @pytest.mark.parametrize("box", [True, False]) + @pytest.mark.parametrize("constructor", [list, tuple, np.array, pd.Index]) + def test_to_datetime_cache(self, utc, format, box, constructor): + date = '20130101 00:00:00' + test_dates = [date] * 10**5 + data = constructor(test_dates) + result = pd.to_datetime(data, utc=utc, format=format, box=box, + cache=True) + expected = pd.to_datetime(data, utc=utc, format=format, box=box, + cache=False) + if box: + tm.assert_index_equal(result, expected) + else: + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("utc", [True, None]) + @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None]) + def test_to_datetime_cache_series(self, utc, format): + date = '20130101 00:00:00' + test_dates = [date] * 10**5 + data = pd.Series(test_dates) + result = pd.to_datetime(data, utc=utc, format=format, cache=True) + expected = pd.to_datetime(data, utc=utc, format=format, cache=False) + tm.assert_series_equal(result, expected) + + def test_to_datetime_cache_scalar(self): + date = '20130101 00:00:00' + result = pd.to_datetime(date, cache=True) + expected = pd.Timestamp('20130101 00:00:00') + assert result == expected + @pytest.mark.parametrize('date, format', [('2017-20', '%Y-%W'), ('20 Sunday', '%W %A'), @@ -388,72 +455,77 @@ def test_week_without_day_and_calendar_year(self, date, format): class TestToDatetimeUnit(object): - - def test_unit(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_unit(self, cache): # GH 11758 # test proper behavior with erros with pytest.raises(ValueError): - to_datetime([1], unit='D', format='%Y%m%d') + to_datetime([1], unit='D', format='%Y%m%d', cache=cache) values = [11111111, 1, 1.0, tslib.iNaT, NaT, np.nan, 'NaT', ''] - result = to_datetime(values, unit='D', errors='ignore') + result = to_datetime(values, unit='D', errors='ignore', cache=cache) expected = Index([11111111, Timestamp('1970-01-02'), Timestamp('1970-01-02'), NaT, NaT, NaT, NaT, NaT], dtype=object) tm.assert_index_equal(result, expected) - result = to_datetime(values, unit='D', errors='coerce') + result = to_datetime(values, unit='D', errors='coerce', cache=cache) expected = DatetimeIndex(['NaT', '1970-01-02', 
'1970-01-02', 'NaT', 'NaT', 'NaT', 'NaT', 'NaT']) tm.assert_index_equal(result, expected) with pytest.raises(tslib.OutOfBoundsDatetime): - to_datetime(values, unit='D', errors='raise') + to_datetime(values, unit='D', errors='raise', cache=cache) values = [1420043460000, tslib.iNaT, NaT, np.nan, 'NaT'] - result = to_datetime(values, errors='ignore', unit='s') + result = to_datetime(values, errors='ignore', unit='s', cache=cache) expected = Index([1420043460000, NaT, NaT, NaT, NaT], dtype=object) tm.assert_index_equal(result, expected) - result = to_datetime(values, errors='coerce', unit='s') + result = to_datetime(values, errors='coerce', unit='s', cache=cache) expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT']) tm.assert_index_equal(result, expected) with pytest.raises(tslib.OutOfBoundsDatetime): - to_datetime(values, errors='raise', unit='s') + to_datetime(values, errors='raise', unit='s', cache=cache) # if we have a string, then we raise a ValueError # and NOT an OutOfBoundsDatetime for val in ['foo', Timestamp('20130101')]: try: - to_datetime(val, errors='raise', unit='s') + to_datetime(val, errors='raise', unit='s', cache=cache) except tslib.OutOfBoundsDatetime: raise AssertionError("incorrect exception raised") except ValueError: pass - def test_unit_consistency(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_unit_consistency(self, cache): # consistency of conversions expected = Timestamp('1970-05-09 14:25:11') - result = pd.to_datetime(11111111, unit='s', errors='raise') + result = pd.to_datetime(11111111, unit='s', errors='raise', + cache=cache) assert result == expected assert isinstance(result, Timestamp) - result = pd.to_datetime(11111111, unit='s', errors='coerce') + result = pd.to_datetime(11111111, unit='s', errors='coerce', + cache=cache) assert result == expected assert isinstance(result, Timestamp) - result = pd.to_datetime(11111111, unit='s', errors='ignore') + result = pd.to_datetime(11111111, unit='s', errors='ignore', + cache=cache) assert result == expected assert isinstance(result, Timestamp) - def test_unit_with_numeric(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_unit_with_numeric(self, cache): # GH 13180 # coercions from floats/ints are ok @@ -462,10 +534,10 @@ def test_unit_with_numeric(self): arr1 = [1.434692e+18, 1.432766e+18] arr2 = np.array(arr1).astype('int64') for errors in ['ignore', 'raise', 'coerce']: - result = pd.to_datetime(arr1, errors=errors) + result = pd.to_datetime(arr1, errors=errors, cache=cache) tm.assert_index_equal(result, expected) - result = pd.to_datetime(arr2, errors=errors) + result = pd.to_datetime(arr2, errors=errors, cache=cache) tm.assert_index_equal(result, expected) # but we want to make sure that we are coercing @@ -474,7 +546,7 @@ def test_unit_with_numeric(self): '2015-06-19 05:33:20', '2015-05-27 22:33:20']) arr = ['foo', 1.434692e+18, 1.432766e+18] - result = pd.to_datetime(arr, errors='coerce') + result = pd.to_datetime(arr, errors='coerce', cache=cache) tm.assert_index_equal(result, expected) expected = DatetimeIndex(['2015-06-19 05:33:20', @@ -482,31 +554,33 @@ def test_unit_with_numeric(self): 'NaT', 'NaT']) arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT'] - result = pd.to_datetime(arr, errors='coerce') + result = pd.to_datetime(arr, errors='coerce', cache=cache) tm.assert_index_equal(result, expected) - def test_unit_mixed(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_unit_mixed(self, cache): # mixed integers/datetimes expected = 
DatetimeIndex(['2013-01-01', 'NaT', 'NaT']) arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18] - result = pd.to_datetime(arr, errors='coerce') + result = pd.to_datetime(arr, errors='coerce', cache=cache) tm.assert_index_equal(result, expected) with pytest.raises(ValueError): - pd.to_datetime(arr, errors='raise') + pd.to_datetime(arr, errors='raise', cache=cache) expected = DatetimeIndex(['NaT', 'NaT', '2013-01-01']) arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')] - result = pd.to_datetime(arr, errors='coerce') + result = pd.to_datetime(arr, errors='coerce', cache=cache) tm.assert_index_equal(result, expected) with pytest.raises(ValueError): - pd.to_datetime(arr, errors='raise') + pd.to_datetime(arr, errors='raise', cache=cache) - def test_dataframe(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_dataframe(self, cache): df = DataFrame({'year': [2015, 2016], 'month': [2, 3], @@ -520,19 +594,20 @@ def test_dataframe(self): result = to_datetime({'year': df['year'], 'month': df['month'], - 'day': df['day']}) + 'day': df['day']}, cache=cache) expected = Series([Timestamp('20150204 00:00:00'), Timestamp('20160305 00:0:00')]) assert_series_equal(result, expected) # dict-like - result = to_datetime(df[['year', 'month', 'day']].to_dict()) + result = to_datetime(df[['year', 'month', 'day']].to_dict(), + cache=cache) assert_series_equal(result, expected) # dict but with constructable df2 = df[['year', 'month', 'day']].to_dict() df2['month'] = 2 - result = to_datetime(df2) + result = to_datetime(df2, cache=cache) expected2 = Series([Timestamp('20150204 00:00:00'), Timestamp('20160205 00:0:00')]) assert_series_equal(result, expected2) @@ -553,7 +628,8 @@ def test_dataframe(self): ] for d in units: - result = to_datetime(df[list(d.keys())].rename(columns=d)) + result = to_datetime(df[list(d.keys())].rename(columns=d), + cache=cache) expected = Series([Timestamp('20150204 06:58:10'), Timestamp('20160305 07:59:11')]) assert_series_equal(result, expected) @@ -568,13 +644,13 @@ def test_dataframe(self): 'us': 'us', 'ns': 'ns'} - result = to_datetime(df.rename(columns=d)) + result = to_datetime(df.rename(columns=d), cache=cache) expected = Series([Timestamp('20150204 06:58:10.001002003'), Timestamp('20160305 07:59:11.001002003')]) assert_series_equal(result, expected) # coerce back to int - result = to_datetime(df.astype(str)) + result = to_datetime(df.astype(str), cache=cache) assert_series_equal(result, expected) # passing coerce @@ -585,8 +661,8 @@ def test_dataframe(self): msg = ("cannot assemble the datetimes: time data .+ does not " "match format '%Y%m%d' \(match\)") with tm.assert_raises_regex(ValueError, msg): - to_datetime(df2) - result = to_datetime(df2, errors='coerce') + to_datetime(df2, cache=cache) + result = to_datetime(df2, errors='coerce', cache=cache) expected = Series([Timestamp('20150204 00:00:00'), NaT]) assert_series_equal(result, expected) @@ -597,7 +673,7 @@ def test_dataframe(self): with tm.assert_raises_regex(ValueError, msg): df2 = df.copy() df2['foo'] = 1 - to_datetime(df2) + to_datetime(df2, cache=cache) # not enough msg = ('to assemble mappings requires at least that \[year, month, ' @@ -608,7 +684,7 @@ def test_dataframe(self): ['month', 'day'], ['year', 'day', 'second']]: with tm.assert_raises_regex(ValueError, msg): - to_datetime(df[c]) + to_datetime(df[c], cache=cache) # duplicates msg = 'cannot assemble with duplicate keys' @@ -617,7 +693,7 @@ def test_dataframe(self): 'day': [4, 5]}) df2.columns = ['year', 'year', 'day'] with 
tm.assert_raises_regex(ValueError, msg): - to_datetime(df2) + to_datetime(df2, cache=cache) df2 = DataFrame({'year': [2015, 2016], 'month': [2, 20], @@ -625,16 +701,17 @@ def test_dataframe(self): 'hour': [4, 5]}) df2.columns = ['year', 'month', 'day', 'day'] with tm.assert_raises_regex(ValueError, msg): - to_datetime(df2) + to_datetime(df2, cache=cache) - def test_dataframe_dtypes(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_dataframe_dtypes(self, cache): # #13451 df = DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]}) # int16 - result = to_datetime(df.astype('int16')) + result = to_datetime(df.astype('int16'), cache=cache) expected = Series([Timestamp('20150204 00:00:00'), Timestamp('20160305 00:00:00')]) assert_series_equal(result, expected) @@ -642,7 +719,7 @@ def test_dataframe_dtypes(self): # mixed dtypes df['month'] = df['month'].astype('int8') df['day'] = df['day'].astype('int8') - result = to_datetime(df) + result = to_datetime(df, cache=cache) expected = Series([Timestamp('20150204 00:00:00'), Timestamp('20160305 00:00:00')]) assert_series_equal(result, expected) @@ -652,18 +729,19 @@ def test_dataframe_dtypes(self): 'month': [1.5, 1], 'day': [1, 1]}) with pytest.raises(ValueError): - to_datetime(df) + to_datetime(df, cache=cache) class TestToDatetimeMisc(object): - def test_index_to_datetime(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_index_to_datetime(self, cache): idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = idx.to_datetime() - expected = DatetimeIndex(pd.to_datetime(idx.values)) + expected = DatetimeIndex(pd.to_datetime(idx.values, cache=cache)) tm.assert_index_equal(result, expected) with tm.assert_produces_warning(FutureWarning, @@ -674,17 +752,19 @@ def test_index_to_datetime(self): expected = DatetimeIndex([today]) tm.assert_index_equal(result, expected) - def test_to_datetime_iso8601(self): - result = to_datetime(["2012-01-01 00:00:00"]) + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_iso8601(self, cache): + result = to_datetime(["2012-01-01 00:00:00"], cache=cache) exp = Timestamp("2012-01-01 00:00:00") assert result[0] == exp - result = to_datetime(['20121001']) # bad iso 8601 + result = to_datetime(['20121001'], cache=cache) # bad iso 8601 exp = Timestamp('2012-10-01') assert result[0] == exp - def test_to_datetime_default(self): - rs = to_datetime('2001') + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_default(self, cache): + rs = to_datetime('2001', cache=cache) xp = datetime(2001, 1, 1) assert rs == xp @@ -694,71 +774,80 @@ def test_to_datetime_default(self): # pytest.raises(ValueError, to_datetime('01-13-2012', # dayfirst=True)) - def test_to_datetime_on_datetime64_series(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_on_datetime64_series(self, cache): # #2699 s = Series(date_range('1/1/2000', periods=10)) - result = to_datetime(s) + result = to_datetime(s, cache=cache) assert result[0] == s[0] - def test_to_datetime_with_space_in_series(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_with_space_in_series(self, cache): # GH 6428 s = Series(['10/18/2006', '10/18/2008', ' ']) - pytest.raises(ValueError, lambda: to_datetime(s, errors='raise')) - result_coerce = to_datetime(s, errors='coerce') + pytest.raises(ValueError, lambda: to_datetime(s, + errors='raise', + cache=cache)) + result_coerce = 
to_datetime(s, errors='coerce', cache=cache) expected_coerce = Series([datetime(2006, 10, 18), datetime(2008, 10, 18), NaT]) tm.assert_series_equal(result_coerce, expected_coerce) - result_ignore = to_datetime(s, errors='ignore') + result_ignore = to_datetime(s, errors='ignore', cache=cache) tm.assert_series_equal(result_ignore, s) - def test_to_datetime_with_apply(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_with_apply(self, cache): # this is only locale tested with US/None locales tm._skip_if_has_locale() # GH 5195 # with a format and coerce a single item to_datetime fails td = Series(['May 04', 'Jun 02', 'Dec 11'], index=[1, 2, 3]) - expected = pd.to_datetime(td, format='%b %y') - result = td.apply(pd.to_datetime, format='%b %y') + expected = pd.to_datetime(td, format='%b %y', cache=cache) + result = td.apply(pd.to_datetime, format='%b %y', cache=cache) assert_series_equal(result, expected) td = pd.Series(['May 04', 'Jun 02', ''], index=[1, 2, 3]) pytest.raises(ValueError, lambda: pd.to_datetime(td, format='%b %y', - errors='raise')) + errors='raise', + cache=cache)) pytest.raises(ValueError, lambda: td.apply(pd.to_datetime, format='%b %y', - errors='raise')) - expected = pd.to_datetime(td, format='%b %y', errors='coerce') + errors='raise', cache=cache)) + expected = pd.to_datetime(td, format='%b %y', errors='coerce', + cache=cache) result = td.apply( - lambda x: pd.to_datetime(x, format='%b %y', errors='coerce')) + lambda x: pd.to_datetime(x, format='%b %y', errors='coerce', + cache=cache)) assert_series_equal(result, expected) - def test_to_datetime_types(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_types(self, cache): # empty string - result = to_datetime('') + result = to_datetime('', cache=cache) assert result is NaT - result = to_datetime(['', '']) + result = to_datetime(['', ''], cache=cache) assert isna(result).all() # ints result = Timestamp(0) - expected = to_datetime(0) + expected = to_datetime(0, cache=cache) assert result == expected # GH 3888 (strings) - expected = to_datetime(['2012'])[0] - result = to_datetime('2012') + expected = to_datetime(['2012'], cache=cache)[0] + result = to_datetime('2012', cache=cache) assert result == expected # array = ['2012','20120101','20120101 12:01:01'] array = ['20120101', '20120101 12:01:01'] - expected = list(to_datetime(array)) + expected = list(to_datetime(array, cache=cache)) result = lmap(Timestamp, array) tm.assert_almost_equal(result, expected) @@ -767,13 +856,15 @@ def test_to_datetime_types(self): # expected = to_datetime('2012') # assert result == expected - def test_to_datetime_unprocessable_input(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_unprocessable_input(self, cache): # GH 4928 tm.assert_numpy_array_equal( - to_datetime([1, '1'], errors='ignore'), + to_datetime([1, '1'], errors='ignore', cache=cache), np.array([1, '1'], dtype='O') ) - pytest.raises(TypeError, to_datetime, [1, '1'], errors='raise') + pytest.raises(TypeError, to_datetime, [1, '1'], errors='raise', + cache=cache) def test_to_datetime_other_datetime64_units(self): # 5/25/2012 @@ -809,7 +900,8 @@ def test_to_datetime_overflow(self): with pytest.raises(OverflowError): date_range(start='1/1/1700', freq='B', periods=100000) - def test_string_na_nat_conversion(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_string_na_nat_conversion(self, cache): # GH #999, #858 from pandas.compat import parse_date @@ -827,7 +919,7 @@ def 
test_string_na_nat_conversion(self): result = tslib.array_to_datetime(strings) tm.assert_almost_equal(result, expected) - result2 = to_datetime(strings) + result2 = to_datetime(strings, cache=cache) assert isinstance(result2, DatetimeIndex) tm.assert_numpy_array_equal(result, result2.values) @@ -835,22 +927,25 @@ def test_string_na_nat_conversion(self): # GH 10636, default is now 'raise' pytest.raises(ValueError, - lambda: to_datetime(malformed, errors='raise')) + lambda: to_datetime(malformed, errors='raise', + cache=cache)) - result = to_datetime(malformed, errors='ignore') + result = to_datetime(malformed, errors='ignore', cache=cache) tm.assert_numpy_array_equal(result, malformed) - pytest.raises(ValueError, to_datetime, malformed, errors='raise') + pytest.raises(ValueError, to_datetime, malformed, errors='raise', + cache=cache) idx = ['a', 'b', 'c', 'd', 'e'] series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan, '1/5/2000'], index=idx, name='foo') - dseries = Series([to_datetime('1/1/2000'), np.nan, - to_datetime('1/3/2000'), np.nan, - to_datetime('1/5/2000')], index=idx, name='foo') + dseries = Series([to_datetime('1/1/2000', cache=cache), np.nan, + to_datetime('1/3/2000', cache=cache), np.nan, + to_datetime('1/5/2000', cache=cache)], + index=idx, name='foo') - result = to_datetime(series) - dresult = to_datetime(dseries) + result = to_datetime(series, cache=cache) + dresult = to_datetime(dseries, cache=cache) expected = Series(np.empty(5, dtype='M8[ns]'), index=idx) for i in range(5): @@ -858,7 +953,7 @@ def test_string_na_nat_conversion(self): if isna(x): expected[i] = tslib.iNaT else: - expected[i] = to_datetime(x) + expected[i] = to_datetime(x, cache=cache) assert_series_equal(result, expected, check_names=False) assert result.name == 'foo' @@ -866,26 +961,29 @@ def test_string_na_nat_conversion(self): assert_series_equal(dresult, expected, check_names=False) assert dresult.name == 'foo' - def test_dti_constructor_numpy_timeunits(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_dti_constructor_numpy_timeunits(self, cache): # GH 9114 - base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT']) + base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT'], + cache=cache) for dtype in ['datetime64[h]', 'datetime64[m]', 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]']: values = base.values.astype(dtype) tm.assert_index_equal(DatetimeIndex(values), base) - tm.assert_index_equal(to_datetime(values), base) + tm.assert_index_equal(to_datetime(values, cache=cache), base) - def test_dayfirst(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_dayfirst(self, cache): # GH 5917 arr = ['10/02/2014', '11/02/2014', '12/02/2014'] expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11), datetime(2014, 2, 12)]) idx1 = DatetimeIndex(arr, dayfirst=True) idx2 = DatetimeIndex(np.array(arr), dayfirst=True) - idx3 = to_datetime(arr, dayfirst=True) - idx4 = to_datetime(np.array(arr), dayfirst=True) + idx3 = to_datetime(arr, dayfirst=True, cache=cache) + idx4 = to_datetime(np.array(arr), dayfirst=True, cache=cache) idx5 = DatetimeIndex(Index(arr), dayfirst=True) idx6 = DatetimeIndex(Series(arr), dayfirst=True) tm.assert_index_equal(expected, idx1) @@ -920,7 +1018,8 @@ def test_guess_datetime_format_for_array(self): class TestToDatetimeInferFormat(object): - def test_to_datetime_infer_datetime_format_consistent_format(self): + @pytest.mark.parametrize('cache', [True, False]) + def 
test_to_datetime_infer_datetime_format_consistent_format(self, cache): s = pd.Series(pd.date_range('20000101', periods=50, freq='H')) test_formats = ['%m-%d-%Y', '%m/%d/%Y %H:%M:%S.%f', @@ -929,90 +1028,113 @@ def test_to_datetime_infer_datetime_format_consistent_format(self): for test_format in test_formats: s_as_dt_strings = s.apply(lambda x: x.strftime(test_format)) - with_format = pd.to_datetime(s_as_dt_strings, format=test_format) + with_format = pd.to_datetime(s_as_dt_strings, format=test_format, + cache=cache) no_infer = pd.to_datetime(s_as_dt_strings, - infer_datetime_format=False) + infer_datetime_format=False, + cache=cache) yes_infer = pd.to_datetime(s_as_dt_strings, - infer_datetime_format=True) + infer_datetime_format=True, + cache=cache) # Whether the format is explicitly passed, it is inferred, or # it is not inferred, the results should all be the same tm.assert_series_equal(with_format, no_infer) tm.assert_series_equal(no_infer, yes_infer) - def test_to_datetime_infer_datetime_format_inconsistent_format(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_infer_datetime_format_inconsistent_format(self, + cache): s = pd.Series(np.array(['01/01/2011 00:00:00', '01-02-2011 00:00:00', '2011-01-03T00:00:00'])) # When the format is inconsistent, infer_datetime_format should just # fallback to the default parsing - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, + cache=cache), + pd.to_datetime(s, infer_datetime_format=True, + cache=cache)) s = pd.Series(np.array(['Jan/01/2011', 'Feb/01/2011', 'Mar/01/2011'])) - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, + cache=cache), + pd.to_datetime(s, infer_datetime_format=True, + cache=cache)) - def test_to_datetime_infer_datetime_format_series_with_nans(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): s = pd.Series(np.array(['01/01/2011 00:00:00', np.nan, '01/03/2011 00:00:00', np.nan])) - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) - - def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, + cache=cache), + pd.to_datetime(s, infer_datetime_format=True, + cache=cache)) + + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_infer_datetime_format_series_start_with_nans(self, + cache): s = pd.Series(np.array([np.nan, np.nan, '01/01/2011 00:00:00', '01/02/2011 00:00:00', '01/03/2011 00:00:00'])) - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), - pd.to_datetime(s, infer_datetime_format=True)) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, + cache=cache), + pd.to_datetime(s, infer_datetime_format=True, + cache=cache)) - def test_to_datetime_iso8601_noleading_0s(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_iso8601_noleading_0s(self, cache): # GH 11871 s = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) expected = pd.Series([pd.Timestamp('2014-01-01'), pd.Timestamp('2014-02-02'), pd.Timestamp('2015-03-03')]) - tm.assert_series_equal(pd.to_datetime(s), expected) - 
tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d'), expected) + tm.assert_series_equal(pd.to_datetime(s, cache=cache), expected) + tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d', + cache=cache), expected) class TestDaysInMonth(object): # tests for issue #10154 - def test_day_not_in_month_coerce(self): - assert isna(to_datetime('2015-02-29', errors='coerce')) + @pytest.mark.parametrize('cache', [True, False]) + def test_day_not_in_month_coerce(self, cache): + assert isna(to_datetime('2015-02-29', errors='coerce', cache=cache)) assert isna(to_datetime('2015-02-29', format="%Y-%m-%d", - errors='coerce')) + errors='coerce', cache=cache)) assert isna(to_datetime('2015-02-32', format="%Y-%m-%d", - errors='coerce')) + errors='coerce', cache=cache)) assert isna(to_datetime('2015-04-31', format="%Y-%m-%d", - errors='coerce')) + errors='coerce', cache=cache)) - def test_day_not_in_month_raise(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_day_not_in_month_raise(self, cache): pytest.raises(ValueError, to_datetime, '2015-02-29', - errors='raise') + errors='raise', cache=cache) pytest.raises(ValueError, to_datetime, '2015-02-29', - errors='raise', format="%Y-%m-%d") + errors='raise', format="%Y-%m-%d", cache=cache) pytest.raises(ValueError, to_datetime, '2015-02-32', - errors='raise', format="%Y-%m-%d") + errors='raise', format="%Y-%m-%d", cache=cache) pytest.raises(ValueError, to_datetime, '2015-04-31', - errors='raise', format="%Y-%m-%d") + errors='raise', format="%Y-%m-%d", cache=cache) - def test_day_not_in_month_ignore(self): - assert to_datetime('2015-02-29', errors='ignore') == '2015-02-29' + @pytest.mark.parametrize('cache', [True, False]) + def test_day_not_in_month_ignore(self, cache): + assert to_datetime('2015-02-29', errors='ignore', + cache=cache) == '2015-02-29' assert to_datetime('2015-02-29', errors='ignore', - format="%Y-%m-%d") == '2015-02-29' + format="%Y-%m-%d", cache=cache) == '2015-02-29' assert to_datetime('2015-02-32', errors='ignore', - format="%Y-%m-%d") == '2015-02-32' + format="%Y-%m-%d", cache=cache) == '2015-02-32' assert to_datetime('2015-04-31', errors='ignore', - format="%Y-%m-%d") == '2015-04-31' + format="%Y-%m-%d", cache=cache) == '2015-04-31' class TestDatetimeParsingWrappers(object): - def test_parsers(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_parsers(self, cache): # https://github.com/dateutil/dateutil/issues/217 import dateutil @@ -1076,7 +1198,7 @@ def test_parsers(self): result3 = to_datetime([date_str], yearfirst=yearfirst) # result5 is used below result4 = to_datetime(np.array([date_str], dtype=object), - yearfirst=yearfirst) + yearfirst=yearfirst, cache=cache) result6 = DatetimeIndex([date_str], yearfirst=yearfirst) # result7 is used below result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) @@ -1106,7 +1228,8 @@ def test_parsers(self): assert result3 is tslib.NaT assert result4 is tslib.NaT - def test_parsers_dayfirst_yearfirst(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_parsers_dayfirst_yearfirst(self, cache): # OK # 2.5.1 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00 # 2.5.2 10-11-12 [dayfirst=0, yearfirst=1] -> 2012-10-11 00:00:00 @@ -1190,7 +1313,7 @@ def test_parsers_dayfirst_yearfirst(self): assert result2 == expected result3 = to_datetime(date_str, dayfirst=dayfirst, - yearfirst=yearfirst) + yearfirst=yearfirst, cache=cache) result4 = DatetimeIndex([date_str], dayfirst=dayfirst, yearfirst=yearfirst)[0] @@ -1199,7 +1322,8 @@ 
def test_parsers_dayfirst_yearfirst(self): assert result3 == expected assert result4 == expected - def test_parsers_timestring(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_parsers_timestring(self, cache): # must be the same as dateutil result cases = {'10:15': (parse('10:15'), datetime(1, 1, 1, 10, 15)), '9:05': (parse('9:05'), datetime(1, 1, 1, 9, 5))} @@ -1254,9 +1378,10 @@ def test_parsers_time(self): assert isinstance(res, list) assert res == expected_arr - def test_parsers_timezone_minute_offsets_roundtrip(self): + @pytest.mark.parametrize('cache', [True, False]) + def test_parsers_timezone_minute_offsets_roundtrip(self, cache): # GH11708 - base = to_datetime("2013-01-01 00:00:00") + base = to_datetime("2013-01-01 00:00:00", cache=cache) dt_strings = [ ('2013-01-01 05:45+0545', "Asia/Katmandu", @@ -1267,7 +1392,7 @@ def test_parsers_timezone_minute_offsets_roundtrip(self): ] for dt_string, tz, dt_string_repr in dt_strings: - dt_time = to_datetime(dt_string) + dt_time = to_datetime(dt_string, cache=cache) assert base == dt_time converted_time = dt_time.tz_localize('UTC').tz_convert(tz) assert dt_string_repr == repr(converted_time)
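A short usage sketch of the new keyword (illustrative only; the actual speed-up depends on the share of duplicate values, which is what the benchmarks above exercise):

import pandas as pd

dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * 10000

# Default behaviour: every element is parsed on its own.
no_cache = pd.to_datetime(dup_string_with_tz, cache=False)

# With cache=True: unique values are parsed once and the results reused.
with_cache = pd.to_datetime(dup_string_with_tz, cache=True)

# Both paths must return the same values; only the amount of work differs.
assert (no_cache == with_cache).all()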