remove infer_datetime_format

MarcoGorelli · MarcoGorelli · commit d8b3730183b3 · 2022-10-10T14:24:23.000+01:00
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -20,7 +20,10 @@
 
 import numpy as np
 
-from pandas._libs import tslib
+from pandas._libs import (
+    lib,
+    tslib,
+)
 from pandas._libs.tslibs import (
     OutOfBoundsDatetime,
     Timedelta,
@@ -706,7 +709,7 @@ def to_datetime(
     format: str | None = None,
     exact: bool = True,
     unit: str | None = None,
-    infer_datetime_format: bool = False,
+    infer_datetime_format: lib.NoDefault | bool = lib.no_default,
     origin: str = "unix",
     cache: bool = True,
 ) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None:
@@ -1053,6 +1056,14 @@ def to_datetime(
                    '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'],
                   dtype='datetime64[ns, UTC]', freq=None)
     """
+    if infer_datetime_format is not lib.no_default:
+        # Kept for compatibility with old code - TODO remove
+        warnings.warn(
+            "The argument 'infer_datetime_format' has been removed - a strict version "
+            "of it is now the default, see "
+            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html",
+            stacklevel=find_stack_level(inspect.currentframe()),
+        )
     if arg is None:
         return None
 
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -123,13 +123,11 @@ def __init__(self, kwds) -> None:
         self.true_values = kwds.get("true_values")
         self.false_values = kwds.get("false_values")
         self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
-        self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
         self.cache_dates = kwds.pop("cache_dates", True)
 
         self._date_conv = _make_date_converter(
             date_parser=self.date_parser,
             dayfirst=self.dayfirst,
-            infer_datetime_format=self.infer_datetime_format,
             cache_dates=self.cache_dates,
         )
 
@@ -1106,7 +1104,6 @@ def _get_empty_meta(
 def _make_date_converter(
     date_parser=None,
     dayfirst: bool = False,
-    infer_datetime_format: bool = False,
     cache_dates: bool = True,
 ):
     def converter(*date_cols):
@@ -1119,7 +1116,6 @@ def converter(*date_cols):
                     utc=None,
                     dayfirst=dayfirst,
                     errors="ignore",
-                    infer_datetime_format=infer_datetime_format,
                     cache=cache_dates,
                 ).to_numpy()
 
@@ -1189,7 +1185,6 @@ def converter(*date_cols):
     "squeeze": None,
     "compression": None,
     "mangle_dupe_cols": True,
-    "infer_datetime_format": False,
     "skip_blank_lines": True,
     "encoding_errors": "strict",
     "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -263,11 +263,6 @@
     :ref:`io.csv.mixed_timezones` for more.
 
     Note: A fast-path exists for iso8601-formatted dates.
-infer_datetime_format : bool, default False
-    If True and `parse_dates` is enabled, pandas will attempt to infer the
-    format of the datetime strings in the columns, and if it can be inferred,
-    switch to a faster method of parsing them. In some cases this can increase
-    the parsing speed by 5-10x.
 keep_date_col : bool, default False
     If True and `parse_dates` specifies combining multiple columns then
     keep the original columns.
@@ -484,7 +479,6 @@
     "decimal",
     "iterator",
     "dayfirst",
-    "infer_datetime_format",
     "verbose",
     "skipinitialspace",
     "low_memory",
@@ -649,7 +643,7 @@ def read_csv(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] | None = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -710,7 +704,7 @@ def read_csv(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] | None = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -771,7 +765,7 @@ def read_csv(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] | None = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -832,7 +826,7 @@ def read_csv(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] | None = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -906,7 +900,7 @@ def read_csv(
     skip_blank_lines: bool = True,
     # Datetime Handling
     parse_dates: bool | Sequence[Hashable] | None = None,
-    infer_datetime_format: bool = False,
+    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
     keep_date_col: bool = False,
     date_parser=None,
     dayfirst: bool = False,
@@ -941,6 +935,14 @@ def read_csv(
     storage_options: StorageOptions = None,
     use_nullable_dtypes: bool = False,
 ) -> DataFrame | TextFileReader:
+    if infer_datetime_format is not lib.no_default:
+        # Kept for compatibility with old code - TODO remove
+        warnings.warn(
+            "The argument 'infer_datetime_format' has been removed - a strict version "
+            "of it is now the default, see "
+            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html",
+            stacklevel=find_stack_level(inspect.currentframe()),
+        )
     # locals() should never be modified
     kwds = locals().copy()
     del kwds["filepath_or_buffer"]
@@ -993,7 +995,7 @@ def read_table(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -1054,7 +1056,7 @@ def read_table(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -1115,7 +1117,7 @@ def read_table(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -1176,7 +1178,7 @@ def read_table(
     verbose: bool = ...,
     skip_blank_lines: bool = ...,
     parse_dates: bool | Sequence[Hashable] = ...,
-    infer_datetime_format: bool = ...,
+    infer_datetime_format: bool | lib.NoDefault = ...,
     keep_date_col: bool = ...,
     date_parser=...,
     dayfirst: bool = ...,
@@ -1250,7 +1252,7 @@ def read_table(
     skip_blank_lines: bool = True,
     # Datetime Handling
     parse_dates: bool | Sequence[Hashable] = False,
-    infer_datetime_format: bool = False,
+    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
     keep_date_col: bool = False,
     date_parser=None,
     dayfirst: bool = False,
@@ -1888,10 +1890,6 @@ def TextParser(*args, **kwds) -> TextFileReader:
         Encoding to use for UTF when reading/writing (ex. 'utf-8')
     squeeze : bool, default False
         returns Series if only one column.
-    infer_datetime_format: bool, default False
-        If True and `parse_dates` is True for a column, try to infer the
-        datetime format based on the first datetime string. If the format
-        can be inferred, there often will be a large parsing speed-up.
     float_precision : str, optional
         Specifies which converter the C engine should use for floating-point
         values. The options are `None` or `high` for the ordinary converter,
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -1260,7 +1260,6 @@ def test_bad_date_parse(all_parsers, cache_dates, value):
         header=None,
         names=["foo", "bar"],
         parse_dates=["foo"],
-        infer_datetime_format=False,
         cache_dates=cache_dates,
     )
 
@@ -1288,7 +1287,6 @@ def test_bad_date_parse_with_warning(all_parsers, cache_dates, value):
         header=None,
         names=["foo", "bar"],
         parse_dates=["foo"],
-        infer_datetime_format=False,
         cache_dates=cache_dates,
     )
 
@@ -1306,6 +1304,19 @@ def test_parse_dates_empty_string(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+def test_parse_dates_infer_datetime_format_warning(all_parsers):
+    # GH 49024
+    parser = all_parsers
+    data = "Date,test\n2012-01-01,1\n,2"
+    parser.read_csv_check_warnings(
+        UserWarning,
+        "The argument 'infer_datetime_format' has been removed",
+        StringIO(data),
+        parse_dates=["Date"],
+        infer_datetime_format=True,
+    )
+
+
 @xfail_pyarrow
 @pytest.mark.parametrize(
     "data,kwargs,expected",
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py