Skip to content

Commit 130406e

Browse files
committed
Allow pd.unique to handle tuples, added test to to_datetime
Refactor tests and add doc notes Add whatsnew and some pep8 changes
1 parent 13b57cd commit 130406e

File tree

5 files changed

+99
-7
lines changed

5 files changed

+99
-7
lines changed

doc/source/whatsnew/v0.21.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ Other Enhancements
7878
- :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`)
7979
- :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`)
8080
- :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`)
81+
- :func:`to_datetime` now accepts a ``cache_datetime`` keyword which allows for faster parsing of duplicate dates. (:issue:`11665`)
82+
8183

8284
.. _whatsnew_0210.api_breaking:
8385

pandas/_libs/lib.pyx

+17
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,23 @@ cpdef ndarray[object] list_to_object_array(list obj):
373373
return arr
374374

375375

376+
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[object] tuple_to_object_array(tuple obj):
    """
    Convert a tuple to a 1-D object ndarray.

    Each item of ``obj`` is assigned to the matching position of a freshly
    allocated object-dtype array, so the result has ``len(obj)`` elements.

    Parameters
    ----------
    obj : tuple

    Returns
    -------
    ndarray[object]
    """
    cdef:
        Py_ssize_t pos, length = len(obj)
        ndarray[object] out = np.empty(length, dtype=object)

    # Element-wise assignment stores each item as a single object.
    for pos in range(length):
        out[pos] = obj[pos]

    return out
391+
392+
376393
@cython.wraparound(False)
377394
@cython.boundscheck(False)
378395
def fast_unique(ndarray[object] values):

pandas/core/algorithms.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,10 @@ def _ensure_arraylike(values):
170170
ABCIndexClass, ABCSeries)):
171171
inferred = lib.infer_dtype(values)
172172
if inferred in ['mixed', 'string', 'unicode']:
173-
values = lib.list_to_object_array(values)
173+
if isinstance(values, tuple):
174+
values = lib.tuple_to_object_array(values)
175+
else:
176+
values = lib.list_to_object_array(values)
174177
else:
175178
values = np.asarray(values)
176179
return values

pandas/core/tools/datetimes.py

+37-6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from datetime import datetime, timedelta, time
22
import numpy as np
3+
import pandas as pd
34
from collections import MutableMapping
45

56
from pandas._libs import lib, tslib
@@ -183,7 +184,8 @@ def _guess_datetime_format_for_array(arr, **kwargs):
183184

184185
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
185186
utc=None, box=True, format=None, exact=True,
186-
unit=None, infer_datetime_format=False, origin='unix'):
187+
unit=None, infer_datetime_format=False, origin='unix',
188+
cache_datetime=False):
187189
"""
188190
Convert argument to datetime.
189191
@@ -257,6 +259,10 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
257259
258260
.. versionadded:: 0.20.0
259261
262+
cache_datetime : boolean, default False
263+
If True, use a cache of unique, converted dates to apply the datetime
264+
conversion. Produces significant speed-ups when parsing duplicate dates.
265+
260266
Returns
261267
-------
262268
ret : datetime if parsing succeeded.
@@ -340,6 +346,19 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
340346

341347
tz = 'utc' if utc else None
342348

349+
cache = None
350+
if (cache_datetime and is_list_like(arg) and
351+
not isinstance(arg, DatetimeIndex)):
352+
# No need to convert with a cache if the arg is already a DatetimeIndex
353+
unique_dates = pd.unique(arg)
354+
if len(unique_dates) != len(arg):
355+
cache = {d: pd.to_datetime(d, errors=errors, dayfirst=dayfirst,
356+
yearfirst=yearfirst, utc=utc, box=box, format=format,
357+
exact=exact, unit=unit,
358+
infer_datetime_format=infer_datetime_format,
359+
origin=origin, cache_datetime=False)
360+
for d in unique_dates}
361+
343362
def _convert_listlike(arg, box, format, name=None, tz=tz):
344363

345364
if isinstance(arg, (list, tuple)):
@@ -505,15 +524,27 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
505524
if isinstance(arg, tslib.Timestamp):
506525
result = arg
507526
elif isinstance(arg, ABCSeries):
508-
from pandas import Series
509-
values = _convert_listlike(arg._values, False, format)
510-
result = Series(values, index=arg.index, name=arg.name)
527+
if cache:
528+
result = arg.map(cache)
529+
else:
530+
values = _convert_listlike(arg._values, False, format)
531+
result = pd.Series(values, index=arg.index, name=arg.name)
511532
elif isinstance(arg, (ABCDataFrame, MutableMapping)):
512533
result = _assemble_from_unit_mappings(arg, errors=errors)
513534
elif isinstance(arg, ABCIndexClass):
514-
result = _convert_listlike(arg, box, format, name=arg.name)
535+
if cache:
536+
result = pd.Series(arg.values).map(cache).values
537+
if box:
538+
result = DatetimeIndex(result, tz=tz, name=arg.name)
539+
else:
540+
result = _convert_listlike(arg, box, format, name=arg.name)
515541
elif is_list_like(arg):
516-
result = _convert_listlike(arg, box, format)
542+
if cache:
543+
result = pd.Series(arg).map(cache).values
544+
if box:
545+
result = DatetimeIndex(result, tz=tz)
546+
else:
547+
result = _convert_listlike(arg, box, format)
517548
else:
518549
result = _convert_listlike(np.array([arg]), box, format)[0]
519550

pandas/tests/indexes/datetimes/test_tools.py

+39
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,45 @@ def test_to_datetime_tz_psycopg2(self):
306306
dtype='datetime64[ns, UTC]')
307307
tm.assert_index_equal(result, expected)
308308

309+
@pytest.mark.parametrize("box", [True, False])
@pytest.mark.parametrize("utc", [True, None])
@pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None])
def test_to_datetime_cache_datetime(self, box, utc, format):
    """
    Check that ``cache_datetime=True`` gives the same result as the
    uncached conversion for list-like, Series and scalar inputs.
    """
    # GH 11665
    test_dates = ['20130101 00:00:00'] * 10

    # Test Index results
    # The uncached conversion of the plain list is the reference for every
    # list-like container; it does not depend on the loop variable.
    expected = pd.to_datetime(test_dates,
                              box=box,
                              utc=utc,
                              format=format)
    test_inputs = [test_dates, tuple(test_dates), np.array(test_dates),
                   pd.Index(test_dates)]
    for test_input in test_inputs:
        # Pass the loop variable (not ``test_dates``) so the tuple,
        # ndarray and Index inputs are actually exercised.
        result = pd.to_datetime(test_input,
                                box=box,
                                utc=utc,
                                format=format,
                                cache_datetime=True)
        if box:
            tm.assert_index_equal(result, expected)
        else:
            tm.assert_numpy_array_equal(result, expected)

    # Test Series result
    expected = pd.to_datetime(pd.Series(test_dates),
                              utc=utc,
                              format=format)
    result = pd.to_datetime(pd.Series(test_dates),
                            utc=utc,
                            format=format,
                            cache_datetime=True)
    tm.assert_series_equal(expected, result)

    # Test Scalar result: cache_datetime=True should not affect conversion
    test_date = '20130101 00:00:00'
    expected = pd.Timestamp('20130101 00:00:00')
    for scalar_result in [test_date, pd.Timestamp(test_date)]:
        result = pd.to_datetime(scalar_result, cache_datetime=True)
        assert result == expected
347+
309348
def test_datetime_bool(self):
310349
# GH13176
311350
with pytest.raises(TypeError):

0 commit comments

Comments
 (0)