🗑️ deprecate infer_datetime_format, make strict

MarcoGorelli · MarcoGorelli · commit 1104a9276b40 · 2022-10-19T10:05:55.000+01:00
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -19,7 +19,10 @@
 
 import numpy as np
 
-from pandas._libs import tslib
+from pandas._libs import (
+    lib,
+    tslib,
+)
 from pandas._libs.tslibs import (
     OutOfBoundsDatetime,
     Timedelta,
@@ -331,7 +334,6 @@ def _convert_listlike_datetimes(
     tz: Timezone | None = None,
     unit: str | None = None,
     errors: DateTimeErrorChoices = "raise",
-    infer_datetime_format: bool = False,
     dayfirst: bool | None = None,
     yearfirst: bool | None = None,
     exact: bool = True,
@@ -415,27 +417,19 @@ def _convert_listlike_datetimes(
     arg = ensure_object(arg)
     require_iso8601 = False
 
-    if infer_datetime_format and format is None:
+    if format is None:
         format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
 
+    # There is a special fast-path for ISO 8601 formatted
+    # datetime strings, so in those cases don't use the inferred
+    # format, because format-based parsing is slower than that
+    # fast-path.
+    if format is not None and format_is_iso(format):
+        require_iso8601 = True
+        format = None
     if format is not None:
-        # There is a special fast-path for iso8601 formatted
-        # datetime strings, so in those cases don't use the inferred
-        # format because this path makes process slower in this
-        # special case
-        format_is_iso8601 = format_is_iso(format)
-        if format_is_iso8601:
-            require_iso8601 = not infer_datetime_format
-            format = None
-
-    if format is not None:
-        res = _to_datetime_with_format(
-            arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format
-        )
-        if res is not None:
-            return res
+        return _to_datetime_with_format(arg, orig_arg, name, tz, format, exact, errors)
 
-    assert format is None or infer_datetime_format
     utc = tz == "utc"
     result, tz_parsed = objects_to_datetime64ns(
         arg,
@@ -464,8 +458,7 @@ def _array_strptime_with_fallback(
     fmt: str,
     exact: bool,
     errors: str,
-    infer_datetime_format: bool,
-) -> Index | None:
+) -> Index:
     """
     Call array_strptime, with fallback behavior depending on 'errors'.
     """
@@ -486,18 +479,14 @@ def _array_strptime_with_fallback(
         # if fmt was inferred, try falling back
         # to array_to_datetime - terminate here
         # for specified formats
-        if not infer_datetime_format:
-            if errors == "raise":
-                raise
-            elif errors == "coerce":
-                result = np.empty(arg.shape, dtype="M8[ns]")
-                iresult = result.view("i8")
-                iresult.fill(iNaT)
-            else:
-                result = arg
+        if errors == "raise":
+            raise
+        elif errors == "coerce":
+            result = np.empty(arg.shape, dtype="M8[ns]")
+            iresult = result.view("i8")
+            iresult.fill(iNaT)
         else:
-            # Indicates to the caller to fallback to objects_to_datetime64ns
-            return None
+            result = arg
     else:
         if "%Z" in fmt or "%z" in fmt:
             return _return_parsed_timezone_results(result, timezones, tz, name)
@@ -513,10 +502,9 @@ def _to_datetime_with_format(
     fmt: str,
     exact: bool,
     errors: str,
-    infer_datetime_format: bool,
-) -> Index | None:
+) -> Index:
     """
-    Try parsing with the given format, returning None on failure.
+    Try parsing with the given format.
     """
     result = None
 
@@ -537,9 +525,7 @@ def _to_datetime_with_format(
             return _box_as_indexlike(result, utc=utc, name=name)
 
     # fallback
-    res = _array_strptime_with_fallback(
-        arg, name, tz, fmt, exact, errors, infer_datetime_format
-    )
+    res = _array_strptime_with_fallback(arg, name, tz, fmt, exact, errors)
     return res
 
 
@@ -713,7 +699,7 @@ def to_datetime(
     format: str | None = None,
     exact: bool = True,
     unit: str | None = None,
-    infer_datetime_format: bool = False,
+    infer_datetime_format: lib.NoDefault | bool = lib.no_default,
     origin: str = "unix",
     cache: bool = True,
 ) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None:
@@ -926,24 +912,6 @@ def to_datetime(
     1   2016-03-05
     dtype: datetime64[ns]
 
-    Passing ``infer_datetime_format=True`` can often-times speedup a parsing
-    if its not an ISO8601 format exactly, but in a regular format.
-
-    >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000)
-    >>> s.head()
-    0    3/11/2000
-    1    3/12/2000
-    2    3/13/2000
-    3    3/11/2000
-    4    3/12/2000
-    dtype: object
-
-    >>> %timeit pd.to_datetime(s, infer_datetime_format=True)  # doctest: +SKIP
-    100 loops, best of 3: 10.4 ms per loop
-
-    >>> %timeit pd.to_datetime(s, infer_datetime_format=False)  # doctest: +SKIP
-    1 loop, best of 3: 471 ms per loop
-
     Using a unix epoch time
 
     >>> pd.to_datetime(1490195805, unit='s')
@@ -1060,6 +1028,15 @@ def to_datetime(
                    '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'],
                   dtype='datetime64[ns, UTC]', freq=None)
     """
+    if infer_datetime_format is not lib.no_default:
+        warnings.warn(
+            "The argument 'infer_datetime_format' is deprecated and will "
+            "be removed in a future version. "
+            "A strict version of it is now the default, see "
+            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
+            "You can safely remove this argument.",
+            stacklevel=find_stack_level(),
+        )
     if arg is None:
         return None
 
@@ -1075,7 +1052,6 @@ def to_datetime(
         yearfirst=yearfirst,
         errors=errors,
         exact=exact,
-        infer_datetime_format=infer_datetime_format,
     )
 
     result: Timestamp | NaTType | Series | Index
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -122,13 +122,11 @@ def __init__(self, kwds) -> None:
         self.true_values = kwds.get("true_values")
         self.false_values = kwds.get("false_values")
         self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
-        self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
         self.cache_dates = kwds.pop("cache_dates", True)
 
         self._date_conv = _make_date_converter(
             date_parser=self.date_parser,
             dayfirst=self.dayfirst,
-            infer_datetime_format=self.infer_datetime_format,
             cache_dates=self.cache_dates,
         )
 
@@ -1105,7 +1103,6 @@ def _get_empty_meta(
 def _make_date_converter(
     date_parser=None,
     dayfirst: bool = False,
-    infer_datetime_format: bool = False,
     cache_dates: bool = True,
 ):
     def converter(*date_cols):
@@ -1118,7 +1115,6 @@ def converter(*date_cols):
                     utc=None,
                     dayfirst=dayfirst,
                     errors="ignore",
-                    infer_datetime_format=infer_datetime_format,
                     cache=cache_dates,
                 ).to_numpy()
 
@@ -1188,7 +1184,6 @@ def converter(*date_cols):
     "squeeze": None,
     "compression": None,
     "mangle_dupe_cols": True,
-    "infer_datetime_format": False,
     "skip_blank_lines": True,
     "encoding_errors": "strict",
     "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -262,11 +262,6 @@
     :ref:`io.csv.mixed_timezones` for more.
 
     Note: A fast-path exists for iso8601-formatted dates.
-infer_datetime_format : bool, default False
-    If True and `parse_dates` is enabled, pandas will attempt to infer the
-    format of the datetime strings in the columns, and if it can be inferred,
-    switch to a faster method of parsing them. In some cases this can increase
-    the parsing speed by 5-10x.
 keep_date_col : bool, default False
     If True and `parse_dates` specifies combining multiple columns then
     keep the original columns.
@@ -483,7 +478,6 @@
     "decimal",
     "iterator",
     "dayfirst",
-    "infer_datetime_format",
     "verbose",
     "skipinitialspace",
     "low_memory",
@@ -648,7 +642,7 @@ def read_csv(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] | None = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -709,7 +703,7 @@ def read_csv(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] | None = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -770,7 +764,7 @@ def read_csv(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] | None = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -831,7 +825,7 @@ def read_csv(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] | None = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -905,7 +899,7 @@ def read_csv(
     skip_blank_lines: bool = True,
     # Datetime Handling
     parse_dates: bool | Sequence[Hashable] | None = None,
-    infer_datetime_format: bool = False,
+    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
     keep_date_col: bool = False,
     date_parser=None,
     dayfirst: bool = False,
@@ -940,6 +934,15 @@ def read_csv(
     storage_options: StorageOptions = None,
     use_nullable_dtypes: bool = False,
 ) -> DataFrame | TextFileReader:
+    if infer_datetime_format is not lib.no_default:
+        warnings.warn(
+            "The argument 'infer_datetime_format' is deprecated and will "
+            "be removed in a future version. "
+            "A strict version of it is now the default, see "
+            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
+            "You can safely remove this argument.",
+            stacklevel=find_stack_level(),
+        )
     # locals() should never be modified
     kwds = locals().copy()
     del kwds["filepath_or_buffer"]
@@ -992,7 +995,7 @@ def read_table(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -1053,7 +1056,7 @@ def read_table(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -1114,7 +1117,7 @@ def read_table(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -1175,7 +1178,7 @@ def read_table(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -1249,7 +1252,7 @@ def read_table(
     skip_blank_lines: bool = True,
     # Datetime Handling
     parse_dates: bool | Sequence[Hashable] = False,
-    infer_datetime_format: bool = False,
+    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
     keep_date_col: bool = False,
     date_parser=None,
     dayfirst: bool = False,
@@ -1883,10 +1886,6 @@ def TextParser(*args, **kwds) -> TextFileReader:
         Encoding to use for UTF when reading/writing (ex. 'utf-8')
     squeeze : bool, default False
         returns Series if only one column.
-    infer_datetime_format: bool, default False
-        If True and `parse_dates` is True for a column, try to infer the
-        datetime format based on the first datetime string. If the format
-        can be inferred, there often will be a large parsing speed-up.
     float_precision : str, optional
         Specifies which converter the C engine should use for floating-point
         values. The options are `None` or `high` for the ordinary converter,