pandas-dev · MarcoGorelli · Dec 4, 2022 · Dec 4, 2022 · Dec 4, 2022 · Dec 5, 2022
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -659,6 +659,7 @@ Datetimelike
 - Bug in ``pandas.tseries.holiday.Holiday`` where a half-open date interval causes inconsistent return types from :meth:`USFederalHolidayCalendar.holidays` (:issue:`49075`)
 - Bug in rendering :class:`DatetimeIndex` and :class:`Series` and :class:`DataFrame` with timezone-aware dtypes with ``dateutil`` or ``zoneinfo`` timezones near daylight-savings transitions (:issue:`49684`)
 - Bug in :func:`to_datetime` was raising ``ValueError`` when parsing :class:`Timestamp` or ``datetime`` objects with non-ISO8601 ``format`` (:issue:`49298`)
+- Bug in :func:`to_datetime` with ``exact`` and ``format=%Y%m%d`` wasn't raising if the input didn't match the format (:issue:`50051`)
 -
 
 Timedelta

diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
@@ -546,17 +546,10 @@ cpdef array_to_datetime(
                     seen_datetime = True
                     iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
 
-                elif is_integer_object(val) or is_float_object(val):
-                    if require_iso8601:
-                        if is_coerce:
-                            iresult[i] = NPY_NAT
-                            continue
-                        elif is_raise:
-                            raise ValueError(
-                                f"time data \"{val}\" at position {i} doesn't "
-                                f"match format \"{format}\""
-                            )
-                        return values, tz_out
+                elif (
+                    (is_integer_object(val) or is_float_object(val))
+                    and format is None
+                ):
                     # these must be ns unit by-definition
                     seen_integer = True
 
@@ -575,7 +568,10 @@ cpdef array_to_datetime(
                         except OverflowError:
                             iresult[i] = NPY_NAT
 
-                elif isinstance(val, str):
+                elif (
+                    (is_integer_object(val) or is_float_object(val))
+                    or isinstance(val, str)
+                ):
                     # string
                     if type(val) is not str:
                         # GH#32264 np.str_ object

diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi
@@ -27,11 +27,6 @@ def try_parse_dates(
     dayfirst: bool = ...,
     default: datetime | None = ...,
 ) -> npt.NDArray[np.object_]: ...
-def try_parse_year_month_day(
-    years: npt.NDArray[np.object_],  # object[:]
-    months: npt.NDArray[np.object_],  # object[:]
-    days: npt.NDArray[np.object_],  # object[:]
-) -> npt.NDArray[np.object_]: ...
 def try_parse_datetime_components(
     years: npt.NDArray[np.object_],  # object[:]
     months: npt.NDArray[np.object_],  # object[:]

diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
@@ -744,25 +744,6 @@ def try_parse_dates(
     return result.base  # .base to access underlying ndarray
 
 
-def try_parse_year_month_day(
-    object[:] years, object[:] months, object[:] days
-) -> np.ndarray:
-    cdef:
-        Py_ssize_t i, n
-        object[::1] result
-
-    n = len(years)
-    # TODO(cython3): Use len instead of `shape[0]`
-    if months.shape[0] != n or days.shape[0] != n:
-        raise ValueError("Length of years/months/days must all be equal")
-    result = np.empty(n, dtype="O")
-
-    for i in range(n):
-        result[i] = datetime(int(years[i]), int(months[i]), int(days[i]))
-
-    return result.base  # .base to access underlying ndarray
-
-
 def try_parse_datetime_components(object[:] years,
                                   object[:] months,
                                   object[:] days,
@@ -890,7 +871,7 @@ def format_is_iso(f: str) -> bint:
     but must be consistent.  Leading 0s in dates and times are optional.
     """
     iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format
-    excluded_formats = ["%Y%m%d", "%Y%m", "%Y"]
+    excluded_formats = ["%Y%m", "%Y"]
 
     for date_sep in [" ", "/", "\\", "-", ".", ""]:
         for time_sep in [" ", "T"]:

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -24,8 +24,6 @@
     Timedelta,
     Timestamp,
     iNaT,
-    nat_strings,
-    parsing,
     timezones as libtimezones,
 )
 from pandas._libs.tslibs.parsing import (
@@ -38,7 +36,6 @@
     AnyArrayLike,
     ArrayLike,
     DateTimeErrorChoices,
-    npt,
 )
 
 from pandas.core.dtypes.common import (
@@ -57,13 +54,11 @@
     ABCDataFrame,
     ABCSeries,
 )
-from pandas.core.dtypes.missing import notna
 
 from pandas.arrays import (
     DatetimeArray,
     IntegerArray,
 )
-from pandas.core import algorithms
 from pandas.core.algorithms import unique
 from pandas.core.arrays.base import ExtensionArray
 from pandas.core.arrays.datetimes import (
@@ -407,7 +402,6 @@ def _convert_listlike_datetimes(
 
     # warn if passing timedelta64, raise for PeriodDtype
     # NB: this must come after unit transformation
-    orig_arg = arg
     try:
         arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
     except TypeError:
@@ -435,8 +429,8 @@ def _convert_listlike_datetimes(
             require_iso8601 = not infer_datetime_format
 
     if format is not None and not require_iso8601:
-        res = _to_datetime_with_format(
-            arg, orig_arg, name, utc, format, exact, errors, infer_datetime_format
+        res = _array_strptime_with_fallback(
+            arg, name, utc, format, exact, errors, infer_datetime_format
         )
         if res is not None:
             return res
@@ -510,43 +504,6 @@ def _array_strptime_with_fallback(
     return _box_as_indexlike(result, utc=utc, name=name)
 
 
-def _to_datetime_with_format(
-    arg,
-    orig_arg,
-    name,
-    utc: bool,
-    fmt: str,
-    exact: bool,
-    errors: str,
-    infer_datetime_format: bool,
-) -> Index | None:
-    """
-    Try parsing with the given format, returning None on failure.
-    """
-    result = None
-
-    # shortcut formatting here
-    if fmt == "%Y%m%d":
-        # pass orig_arg as float-dtype may have been converted to
-        # datetime64[ns]
-        orig_arg = ensure_object(orig_arg)
-        try:
-            # may return None without raising
-            result = _attempt_YYYYMMDD(orig_arg, errors=errors)
-        except (ValueError, TypeError, OutOfBoundsDatetime) as err:
-            raise ValueError(
-                "cannot convert the input to '%Y%m%d' date format"
-            ) from err
-        if result is not None:
-            return _box_as_indexlike(result, utc=utc, name=name)
-
-    # fallback
-    res = _array_strptime_with_fallback(
-        arg, name, utc, fmt, exact, errors, infer_datetime_format
-    )
-    return res
-
-
 def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
     """
     to_datetime specalized to the case where a 'unit' is passed.
@@ -1244,60 +1201,6 @@ def coerce(values):
     return values
 
 
-def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None:
-    """
-    try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,
-    arg is a passed in as an object dtype, but could really be ints/strings
-    with nan-like/or floats (e.g. with nan)
-
-    Parameters
-    ----------
-    arg : np.ndarray[object]
-    errors : {'raise','ignore','coerce'}
-    """
-
-    def calc(carg):
-        # calculate the actual result
-        carg = carg.astype(object, copy=False)
-        parsed = parsing.try_parse_year_month_day(
-            carg / 10000, carg / 100 % 100, carg % 100
-        )
-        return tslib.array_to_datetime(parsed, errors=errors)[0]
-
-    def calc_with_mask(carg, mask):
-        result = np.empty(carg.shape, dtype="M8[ns]")
-        iresult = result.view("i8")
-        iresult[~mask] = iNaT
-
-        masked_result = calc(carg[mask].astype(np.float64).astype(np.int64))
-        result[mask] = masked_result.astype("M8[ns]")
-        return result
-
-    # try intlike / strings that are ints
-    try:
-        return calc(arg.astype(np.int64))
-    except (ValueError, OverflowError, TypeError):
-        pass
-
-    # a float with actual np.nan
-    try:
-        carg = arg.astype(np.float64)
-        return calc_with_mask(carg, notna(carg))
-    except (ValueError, OverflowError, TypeError):
-        pass
-
-    # string with NaN-like
-    try:
-        # error: Argument 2 to "isin" has incompatible type "List[Any]"; expected
-        # "Union[Union[ExtensionArray, ndarray], Index, Series]"
-        mask = ~algorithms.isin(arg, list(nat_strings))  # type: ignore[arg-type]
-        return calc_with_mask(arg, mask)
-    except (ValueError, OverflowError, TypeError):
-        pass
-
-    return None
-
-
 __all__ = [
     "DateParseError",
     "should_cache",

diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
@@ -125,24 +125,21 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache):
         expected[2] = np.nan
         ser[2] = np.nan
 
-        result = to_datetime(ser, format="%Y%m%d", cache=cache)
-        tm.assert_series_equal(result, expected)
+        with pytest.raises(ValueError, match=None):
+            to_datetime(ser, format="%Y%m%d", cache=cache)
 
         # string with NaT
         ser2 = ser.apply(str)
         ser2[2] = "nat"
-        result = to_datetime(ser2, format="%Y%m%d", cache=cache)
-        tm.assert_series_equal(result, expected)
+        with pytest.raises(ValueError, match=None):
+            to_datetime(ser2, format="%Y%m%d", cache=cache)
 
     def test_to_datetime_format_YYYYMMDD_ignore(self, cache):
         # coercion
         # GH 7930
         ser = Series([20121231, 20141231, 99991231])
+        expected = Series([20121231, 20141231, 99991231], dtype=object)
         result = to_datetime(ser, format="%Y%m%d", errors="ignore", cache=cache)
-        expected = Series(
-            [datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)],
-            dtype=object,
-        )
         tm.assert_series_equal(result, expected)
 
     def test_to_datetime_format_YYYYMMDD_coercion(self, cache):
@@ -249,7 +246,7 @@ def test_to_datetime_format_integer(self, cache):
             # valid date, length == 8
             [20121030, datetime(2012, 10, 30)],
             # short valid date, length == 6
-            [199934, datetime(1999, 3, 4)],
+            [199934, 199934],
             # long integer date partially parsed to datetime(2012,1,1), length > 8
             [2012010101, 2012010101],
             # invalid date partially parsed to datetime(2012,9,9), length == 8
@@ -1714,8 +1711,8 @@ def test_dataframe_coerce(self, cache):
         df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]})
 
         msg = (
-            "cannot assemble the datetimes: time data .+ does not "
-            r"match format '%Y%m%d' \(match\)"
+            r"cannot assemble the datetimes: time data .+ doesn't "
+            r'match format "%Y%m%d"'
         )
         with pytest.raises(ValueError, match=msg):
             to_datetime(df2, cache=cache)
@@ -1791,7 +1788,10 @@ def test_dataframe_mixed(self, cache):
     def test_dataframe_float(self, cache):
         # float
         df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]})
-        msg = "cannot assemble the datetimes: unconverted data remains: 1"
+        msg = (
+            r'cannot assemble the datetimes: time data "20000151" at '
+            r'position 0 doesn\'t match format "%Y%m%d"'
+        )
         with pytest.raises(ValueError, match=msg):
             to_datetime(df, cache=cache)
 

diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
@@ -288,7 +288,7 @@ def test_parse_time_string_check_instance_type_raise_exception():
         ("%Y-%m-%dT%H:%M:%S.%f", True),
         ("%Y-%m-%dT%H:%M:%S.%f%z", True),
         ("%Y-%m-%dT%H:%M:%S.%f%Z", False),
-        ("%Y%m%d", False),
+        ("%Y%m%d", True),
         ("%Y%m", False),
         ("%Y", False),
         ("%Y-%m-%d", True),