PERF: Add cache keyword to to_datetime (pandas-dev#11665) (pandas-dev#17077)

mroeschke · No-Stream · commit 2267b976828d · 2017-11-28T15:15:14.000-08:00
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
@@ -346,17 +346,22 @@ class ToDatetime(object):
 
     def setup(self):
         self.rng = date_range(start='1/1/2000', periods=10000, freq='D')
-        self.stringsD = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str)
+        self.stringsD = Series(self.rng.strftime('%Y%m%d'))
 
         self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
-        self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
-        self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng]
+        self.strings = self.rng.strftime('%Y-%m-%d %H:%M:%S').tolist()
+        self.strings_nosep = self.rng.strftime('%Y%m%d %H:%M:%S').tolist()
         self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800'
                                  for x in self.rng]
 
         self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000))
         self.s2 = self.s.str.replace(':\\S+$', '')
 
+        self.unique_numeric_seconds = range(10000)
+        self.dup_numeric_seconds = [1000] * 10000
+        self.dup_string_dates = ['2000-02-11'] * 10000
+        self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * 10000
+
     def time_format_YYYYMMDD(self):
         to_datetime(self.stringsD, format='%Y%m%d')
 
@@ -381,6 +386,36 @@ def time_format_exact(self):
     def time_format_no_exact(self):
         to_datetime(self.s, format='%d%b%y', exact=False)
 
+    def time_cache_true_with_unique_seconds_and_unit(self):
+        to_datetime(self.unique_numeric_seconds, unit='s', cache=True)
+
+    def time_cache_false_with_unique_seconds_and_unit(self):
+        to_datetime(self.unique_numeric_seconds, unit='s', cache=False)
+
+    def time_cache_true_with_dup_seconds_and_unit(self):
+        to_datetime(self.dup_numeric_seconds, unit='s', cache=True)
+
+    def time_cache_false_with_dup_seconds_and_unit(self):
+        to_datetime(self.dup_numeric_seconds, unit='s', cache=False)
+
+    def time_cache_true_with_dup_string_dates(self):
+        to_datetime(self.dup_string_dates, cache=True)
+
+    def time_cache_false_with_dup_string_dates(self):
+        to_datetime(self.dup_string_dates, cache=False)
+
+    def time_cache_true_with_dup_string_dates_and_format(self):
+        to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=True)
+
+    def time_cache_false_with_dup_string_dates_and_format(self):
+        to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=False)
+
+    def time_cache_true_with_dup_string_tzoffset_dates(self):
+        to_datetime(self.dup_string_with_tz, cache=True)
+
+    def time_cache_false_with_dup_string_tzoffset_dates(self):
+        to_datetime(self.dup_string_with_tz, cache=False)
+
 
 class Offsets(object):
     goal_time = 0.2
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -70,7 +70,7 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`)
--
+- Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`)
 -
 
 .. _whatsnew_0220.docs:
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -36,9 +36,77 @@ def _guess_datetime_format_for_array(arr, **kwargs):
         return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
 
 
+def _maybe_cache(arg, format, cache, tz, convert_listlike):
+    """
+    Create a cache of unique dates from an array of dates
+
+    Parameters
+    ----------
+    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
+    format : string
+        Strftime format to parse time
+    cache : boolean
+        True attempts to create a cache of converted values
+    tz : string
+        Timezone of the dates
+    convert_listlike : function
+        Conversion function to apply on dates
+
+    Returns
+    -------
+    cache_array : Series
+        Cache of converted, unique dates. Can be empty
+    """
+    from pandas import Series
+    cache_array = Series()
+    if cache:
+        # Perform a quicker unique check
+        from pandas import Index
+        if not Index(arg).is_unique:
+            unique_dates = algorithms.unique(arg)
+            cache_dates = convert_listlike(unique_dates, True, format, tz=tz)
+            cache_array = Series(cache_dates, index=unique_dates)
+    return cache_array
+
+
+def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
+    """
+    Convert array of dates with a cache and box the result
+
+    Parameters
+    ----------
+    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
+    cache_array : Series
+        Cache of converted, unique dates
+    box : boolean
+        True boxes result as an Index-like, False returns an ndarray
+    errors : string
+        'ignore' plus box=True will convert result to Index
+    name : string, default None
+        Name for a DatetimeIndex
+
+    Returns
+    -------
+    result : datetime of converted dates
+        Returns:
+
+        - Index-like if box=True
+        - ndarray if box=False
+    """
+    from pandas import Series, DatetimeIndex, Index
+    result = Series(arg).map(cache_array)
+    if box:
+        if errors == 'ignore':
+            return Index(result)
+        else:
+            return DatetimeIndex(result, name=name)
+    return result.values
+
+
 def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
                 utc=None, box=True, format=None, exact=True,
-                unit=None, infer_datetime_format=False, origin='unix'):
+                unit=None, infer_datetime_format=False, origin='unix',
+                cache=False):
     """
     Convert argument to datetime.
 
@@ -111,7 +179,12 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
           origin.
 
         .. versionadded: 0.20.0
+    cache : boolean, default False
+        If True, use a cache of unique, converted dates to apply the datetime
+        conversion. May produce sigificant speed-up when parsing duplicate date
+        strings, especially ones with timezone offsets.
 
+        .. versionadded: 0.22.0
     Returns
     -------
     ret : datetime if parsing succeeded.
@@ -369,15 +442,28 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
     if isinstance(arg, tslib.Timestamp):
         result = arg
     elif isinstance(arg, ABCSeries):
-        from pandas import Series
-        values = _convert_listlike(arg._values, True, format)
-        result = Series(values, index=arg.index, name=arg.name)
+        cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
+        if not cache_array.empty:
+            result = arg.map(cache_array)
+        else:
+            from pandas import Series
+            values = _convert_listlike(arg._values, True, format)
+            result = Series(values, index=arg.index, name=arg.name)
     elif isinstance(arg, (ABCDataFrame, MutableMapping)):
         result = _assemble_from_unit_mappings(arg, errors=errors)
     elif isinstance(arg, ABCIndexClass):
-        result = _convert_listlike(arg, box, format, name=arg.name)
+        cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
+        if not cache_array.empty:
+            result = _convert_and_box_cache(arg, cache_array, box, errors,
+                                            name=arg.name)
+        else:
+            result = _convert_listlike(arg, box, format, name=arg.name)
     elif is_list_like(arg):
-        result = _convert_listlike(arg, box, format)
+        cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike)
+        if not cache_array.empty:
+            result = _convert_and_box_cache(arg, cache_array, box, errors)
+        else:
+            result = _convert_listlike(arg, box, format)
     else:
         result = _convert_listlike(np.array([arg]), box, format)[0]
 
diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py

Original file line number	Diff line number	Diff line change
`@@ -70,7 +70,7 @@ Performance Improvements`
`70`	`70`	`~~~~~~~~~~~~~~~~~~~~~~~~`
`71`	`71`
`72`	`72`	- Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`)
`73`		`--`
	`73`	+- Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`)
`74`	`74`	`-`
`75`	`75`
`76`	`76`	`.. _whatsnew_0220.docs:`