From fd7a534d7780ba26a13dc85eb83c3ff2107cb360 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Wed, 12 Apr 2023 01:52:18 +0200
Subject: [PATCH 01/37] All tests pass

---
 pandas/_libs/tslib.pyx                 |  34 ++-
 pandas/core/tools/datetimes.py         | 311 +++++++++++++++++++++----
 pandas/tests/tools/test_to_datetime.py | 309 +++++++++++++++++++++---
 3 files changed, 575 insertions(+), 79 deletions(-)

diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index 106f203a16855..8b790e3bd8adc 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -69,17 +69,22 @@ from pandas._libs.tslibs.nattype cimport (
 )
 from pandas._libs.tslibs.timestamps cimport _Timestamp
 
+import cython
+
 from pandas._libs.tslibs import (
     Resolution,
     get_resolution,
 )
 from pandas._libs.tslibs.timestamps import Timestamp
 
-# Note: this is the only non-tslibs intra-pandas dependency here
+from libc.stdlib cimport srand
+from libc.time cimport time
 
 from pandas._libs.missing cimport checknull_with_nat_and_na
 from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
 
+# Note: this is the only non-tslibs intra-pandas dependency here
+
 
 def _test_parse_iso8601(ts: str):
     """
@@ -398,6 +403,33 @@ def first_non_null(values: ndarray) -> int:
         return -1
 
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def random_non_null(values: ndarray, int n) -> ndarray:
+    """Return indices of up to n randomly chosen usable (non-null) values."""
+    cdef:
+        Py_ssize_t total = len(values)
+        Py_ssize_t i, non_null_count
+        list non_null_indices = []
+    # Sampling below uses NumPy's global RNG, so libc's rand() state is never
+    # consulted; reseeding it with srand() here would be a useless side effect.
+    for i in range(total):
+        val = values[i]
+        if checknull_with_nat_and_na(val):
+            continue
+        # Empty strings, NaT-like strings and "now"/"today" carry no format.
+        if isinstance(val, str) and (
+            len(val) == 0 or val in nat_strings or val in ("now", "today")
+        ):
+            continue
+        non_null_indices.append(i)
+    non_null_count = len(non_null_indices)
+    if non_null_count == 0 or n <= 0:
+        return np.empty(0, dtype=np.int64)
+    # Sample without replacement, capped at the number of usable values.
+    return np.random.choice(non_null_indices, min(n, non_null_count), replace=False)
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 cpdef array_to_datetime(
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index ac0a014a3ccf6..e785fe400c631 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -4,6 +4,7 @@
 from datetime import datetime
 from functools import partial
 from itertools import islice
+import re
 from typing import (
     TYPE_CHECKING,
     Callable,
@@ -129,27 +130,207 @@ class FulldatetimeDict(YearMonthDayDict, total=False):
 # ---------------------------------------------------------------------
 
 
-def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
-    # Try to guess the format based on the first non-NaN element, return None if can't
-    if (first_non_null := tslib.first_non_null(arr)) != -1:
-        if type(first_non_nan_element := arr[first_non_null]) is str:
-            # GH#32264 np.str_ object
-            guessed_format = guess_datetime_format(
-                first_non_nan_element, dayfirst=dayfirst
+def _check_format_dayfirst(format_string):
+    """
+    Classify the day/month/year order of a strftime-style format string.
+
+    Returns True when "%d" precedes "%m" and "%m" precedes "%Y", False when
+    "%m" precedes "%d" and "%d" precedes "%Y", and None otherwise (including
+    when any of the three directives is absent).
+    """
+    directives = ("%d", "%m", "%Y")
+    if any(code not in format_string for code in directives):
+        return None
+    # Position of the first occurrence of each directive.
+    day_at, month_at, year_at = map(format_string.index, directives)
+    if day_at < month_at < year_at:
+        return True
+    if month_at < day_at < year_at:
+        return False
+    return None
+
+
+def _guess_datetime_format_for_array(
+    arr, n_find_format, n_check_format
+) -> np.ndarray:
+    """
+    Guess the format of the datetime strings in an array.
+
+    Parameters
+    ----------
+    arr : ndarray
+        Array of datetime strings.
+    n_find_format : int
+        Number of strings to use to guess the format.
+    n_check_format : int
+        Number of strings to check for each format found.
+
+    Returns
+    -------
+    formats : ndarray
+        Object array of (format, percentage) pairs, where percentage is the
+        integer share of the checked sample parsed by that format, sorted from
+        highest to lowest percentage. Empty if no format could be guessed.
+    """
+    # Extract a random sample of datetime strings
+    assert (
+        n_find_format <= n_check_format
+    ), "n_find_format must be less than or equal to n_check_format"
+    sample_idx = tslib.random_non_null(arr, n_check_format)
+    sample_check = arr[sample_idx]
+    sample_find = sample_check[:n_find_format]
+    if len(sample_idx) == 0:
+        return np.array([], dtype=object)
+    format_found = set()
+    for datetime_string in sample_find:
+        # catch warnings from guess_datetime_format
+        # which appear when dayfirst is contradicted
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                category=UserWarning,
+                message="Parsing dates in .* format when dayfirst=.* was specified.",
             )
-            if guessed_format is not None:
-                return guessed_format
-            # If there are multiple non-null elements, warn about
-            # how parsing might not be consistent
-            if tslib.first_non_null(arr[first_non_null + 1 :]) != -1:
-                warnings.warn(
-                    "Could not infer format, so each element will be parsed "
-                    "individually, falling back to `dateutil`. To ensure parsing is "
-                    "consistent and as-expected, please specify a format.",
-                    UserWarning,
-                    stacklevel=find_stack_level(),
-                )
-    return None
+            if type(datetime_string) is str:
+                format_found.add(guess_datetime_format(datetime_string, dayfirst=False))
+                format_found.add(guess_datetime_format(datetime_string, dayfirst=True))
+    if None in format_found:
+        format_found.remove(None)
+    # remove YDM as it does not exist
+    # but is returned by guess_datetime_format
+    for format in list(format_found):
+        if re.match(r"%Y[-/_.]+%d[-/_.]+%m", format):
+            # doesn't exist but is returned by guess_datetime_format
+            # FIXME
+            format_found.remove(format)
+    # Try to apply the formats found
+    # to a larger sample
+    format_checked = []
+    for format in format_found:
+        converted = array_strptime(sample_check, fmt=format, errors="coerce")[0]
+        format_checked.append(
+            (format, int(100 * np.sum(~np.isnan(converted)) / len(converted)))
+        )
+    # Sort by the number of strings that match the format
+    format_checked.sort(key=lambda x: x[1], reverse=True)
+    if (
+        len(format_checked) == 0
+        and len(sample_check) > 1
+        and np.any([type(e) is str for e in sample_find])
+        # GH#32264 np.str_ objects
+    ):
+        warnings.warn(
+            "Could not infer format, so each element will be parsed "
+            "individually, falling back to `dateutil`. To ensure parsing is "
+            "consistent and as-expected, please specify a format.",
+            UserWarning,
+            stacklevel=find_stack_level(),
+        )
+    return np.array(format_checked, dtype=object)
+
+
+def _try_to_repect_dayfirst(formats, dayfirst):
+    """
+    Choose one format among those tied for the best match percentage.
+
+    When more than one format shares the top percentage, the first of them
+    whose day/month order equals ``dayfirst`` is chosen; otherwise the first
+    top-scoring format is used.
+
+    Parameters
+    ----------
+    formats : ndarray
+        (format, percentage) pairs sorted by decreasing percentage.
+    dayfirst : bool
+        Day/month order to prefer when breaking ties.
+
+    Returns
+    -------
+    best_format : str
+        The selected format string.
+    format_dayfirst : bool or None
+        Result of ``_check_format_dayfirst`` for the selected format.
+    """
+    top_score = formats[0][1]
+    candidates = [entry[0] for entry in formats if entry[1] == top_score]
+    # A tie is the only case in which dayfirst influences the choice.
+    if len(candidates) > 1:
+        for candidate in candidates:
+            order = _check_format_dayfirst(candidate)
+            if order == dayfirst:
+                return candidate, order
+    fallback = candidates[0]
+    return fallback, _check_format_dayfirst(fallback)
+
+
+def _iterative_conversion(arg, formats, utc, unit, errors, dayfirst, yearfirst, exact):
+    """
+    Parse mixed-format datetime strings by trying candidate formats in turn,
+    each format being applied only to the values still unparsed.
+
+    Parameters
+    ----------
+    arg : ndarray
+        Array of datetime strings.
+    formats : ndarray
+        (format, percentage) pairs sorted by decreasing percentage.
+    utc : bool
+        Whether to convert/localize timestamps to UTC.
+    unit : str
+        None or string of the frequency of the passed data (unused here).
+    errors : str
+        error handling behaviors from to_datetime, 'raise', 'coerce', 'ignore'
+    dayfirst : bool
+        dayfirst parsing behavior from to_datetime
+    yearfirst : bool
+        yearfirst parsing behavior from to_datetime (unused here)
+    exact : bool, default True
+        exact format matching behavior from to_datetime
+
+    Returns
+    -------
+    tuple
+        ``(result, tz_parsed)``; with errors='ignore', a failure returns ``arg``.
+    """
+    # Parse everything with the best format in "coerce" mode, then retry the
+    # remaining failures with up to 10 further formats, stopping early once
+    # every value has been parsed.
+    best_format = _try_to_repect_dayfirst(formats, dayfirst)[0]
+    # remove the best format from the list
+    formats = formats[formats[:, 0] != best_format]
+    result, tz_parsed = array_strptime(arg, best_format, exact, "coerce", utc)
+    indices_succeeded = notna(result)
+    for _ in range(min(len(formats), 10)):
+        best_format = _try_to_repect_dayfirst(formats, dayfirst)[0]
+        formats = formats[formats[:, 0] != best_format]
+        results_format, timezones_format = array_strptime(
+            arg[~indices_succeeded], best_format, exact, "coerce", utc
+        )
+        indices_succeeded_small = notna(results_format)
+        update_indices = np.arange(len(result))[~indices_succeeded][
+            indices_succeeded_small
+        ]
+        result[update_indices] = results_format[indices_succeeded_small]
+        # Write through update_indices; chained mask indexing would only
+        # modify a temporary copy.
+        tz_parsed[update_indices] = timezones_format[indices_succeeded_small]
+        indices_succeeded[~indices_succeeded] = indices_succeeded_small
+        if indices_succeeded.all():
+            break
+    if not indices_succeeded.all():
+        # if we exhausted all formats and still have missing values
+        if errors == "raise":
+            raise ValueError(
+                f"""Unable to parse "{arg[~indices_succeeded][0]}" as a date.
+                    You can pass `errors="coerce"` or `errors="ignore"` to
+                    ignore this error."""
+            )
+        elif errors == "coerce":
+            result[~indices_succeeded] = iNaT
+        elif errors == "ignore":
+            # TODO check
+            result = arg
+    return result, tz_parsed
 
 
 def should_cache(
@@ -445,27 +626,64 @@ def _convert_listlike_datetimes(
 
     arg = ensure_object(arg)
 
-    if format is None:
-        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
-
-    # `format` could be inferred, or user didn't ask for mixed-format parsing.
+    # Candidate formats are guessed from a sample of the values and ranked by
+    # the percentage of sampled elements they parse, highest first.
+    # `best_format` stays None when the per-format fallbacks below are needed.
+    best_format = None
     if format is not None and format != "mixed":
-        return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
-
-    result, tz_parsed = objects_to_datetime64ns(
-        arg,
-        dayfirst=dayfirst,
-        yearfirst=yearfirst,
-        utc=utc,
-        errors=errors,
-        allow_object=True,
-    )
-
-    if tz_parsed is not None:
-        # We can take a shortcut since the datetime64 numpy array
-        # is in UTC
-        dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
-        return DatetimeIndex._simple_new(dta, name=name)
+        best_format = format
+    else:
+        # guess the format
+        formats = _guess_datetime_format_for_array(
+            arg, n_find_format=20, n_check_format=250
+        )
+        if len(formats) == 0:
+            result, tz_parsed = objects_to_datetime64ns(
+                arg,
+                dayfirst=dayfirst,
+                yearfirst=yearfirst,
+                utc=utc,
+                errors=errors,
+                allow_object=True,
+            )
+            if tz_parsed is not None:
+                # We can take a shortcut since the datetime64 numpy array
+                # is in UTC
+                dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
+                return DatetimeIndex._simple_new(dta, name=name)
+        if format != "mixed" and len(formats) > 0:
+            # formats[0][1] is the percentage of elements that matched
+            if errors == "raise" and formats[0][1] != 100:
+                raise ValueError(
+                    "No datetime format was found which "
+                    "matched all values in the array.\n"
+                    "You might want to try:\n"
+                    "    - passing `format` if your strings have a consistent format;\n"
+                    "    - passing `format='ISO8601'` if your strings are "
+                    "all ISO8601 but not necessarily in exactly the same format;\n"
+                    "    - passing `format='mixed'`, and the format will be "
+                    "inferred for each element individually. "
+                    "You might want to use `dayfirst` alongside this.\n"
+                    f"Best format found: {formats[0][0]} "
+                    f"(matched {formats[0][1]}% of the values)"
+                )
+            best_format, best_format_dayfirst = _try_to_repect_dayfirst(
+                formats, dayfirst
+            )
+            if best_format_dayfirst is not None and best_format_dayfirst != dayfirst:
+                warnings.warn(
+                    f"Parsing dates in {best_format} format when "
+                    f"dayfirst={dayfirst} was specified. "
+                    f"Pass `dayfirst={not dayfirst}` or specify a format "
+                    "to silence this warning.",
+                    stacklevel=find_stack_level(),
+                )
+    if best_format is not None:
+        return _array_strptime_with_fallback(arg, name, utc, best_format, exact, errors)
+    if format == "mixed" and len(formats) > 0:
+        result, tz_parsed = _iterative_conversion(
+            arg, formats, utc, unit, errors, dayfirst, yearfirst, exact
+        )
 
     return _box_as_indexlike(result, utc=utc, name=name)
 
@@ -765,8 +983,9 @@ def to_datetime(
 
         - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
           time string (not necessarily in exactly the same format);
-        - "mixed", to infer the format for each element individually. This is risky,
-          and you should probably use it along with `dayfirst`.
+        - "mixed", to allow for multiple formats. Values will be parsed iteratively
+          using the most promising format at each step. This is risky,
+          and you should probably use it along with `dayfirst`.
     exact : bool, default True
         Control how `format` is used:
 
@@ -945,6 +1164,14 @@ def to_datetime(
     >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
     NaT
 
+    **Ambiguous format**
+
+    If multiple datetime formats are possible for a value, pandas will try to infer
+    the most plausible format using the other examples.
+
+    >>> pd.to_datetime(["01-02-2012", "30-01-2012"])
+    DatetimeIndex(['2012-02-01', '2012-01-30'], dtype='datetime64[ns]', freq=None)
+
     .. _to_datetime_tz_examples:
 
     **Timezones and time offsets**
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index 7b707be97c653..326810177fb3a 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -1313,10 +1313,7 @@ def test_datetime_bool_arrays_mixed(self, cache):
             to_datetime([False, datetime.today()], cache=cache)
         with pytest.raises(
             ValueError,
-            match=(
-                r'^time data "True" doesn\'t match format "%Y%m%d", '
-                f"at position 1. {PARSING_ERR_MSG}$"
-            ),
+            match=(f"{PARSING_ERR_MSG}"),
         ):
             to_datetime(["20130101", True], cache=cache)
         tm.assert_index_equal(
@@ -1663,11 +1660,11 @@ def test_mixed_offsets_with_native_datetime_raises(self):
         mixed = to_datetime(ser)
         expected = Series(
             [
-                "NaT",
+                NaT,
                 Timestamp("1990-01-01"),
                 Timestamp("2015-03-14T16:15:14.123-08:00").to_pydatetime(),
                 Timestamp("2019-03-04T21:56:32.620-07:00").to_pydatetime(),
-                None,
+                NaT,  # TODO check
             ],
             dtype=object,
         )
@@ -1866,7 +1863,7 @@ def test_unit_with_numeric(self, cache, errors, dtype):
             [
                 ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"],
                 [1.434692e18, 1.432766e18, "foo", "NaT"],
-                None,
+                UserWarning,
             ],
         ],
     )
@@ -2404,10 +2401,7 @@ def test_to_datetime_on_datetime64_series(self, cache):
     def test_to_datetime_with_space_in_series(self, cache):
         # GH 6428
         ser = Series(["10/18/2006", "10/18/2008", " "])
-        msg = (
-            r'^time data " " doesn\'t match format "%m/%d/%Y", '
-            rf"at position 2. {PARSING_ERR_MSG}$"
-        )
+        msg = rf"{PARSING_ERR_MSG}"
         with pytest.raises(ValueError, match=msg):
             to_datetime(ser, errors="raise", cache=cache)
         result_coerce = to_datetime(ser, errors="coerce", cache=cache)
@@ -2478,6 +2472,7 @@ def test_to_datetime_strings_vs_constructor(self, result):
         expected = Timestamp(2012, 1, 1)
         assert result == expected
 
+    @pytest.mark.filterwarnings("ignore:Could not infer format")
     def test_to_datetime_unprocessable_input(self, cache):
         # GH 4928
         # GH 21864
@@ -2670,10 +2665,7 @@ def test_dayfirst_warnings_invalid_input(self):
 
         with pytest.raises(
             ValueError,
-            match=(
-                r'^time data "03/30/2011" doesn\'t match format '
-                rf'"%d/%m/%Y", at position 1. {PARSING_ERR_MSG}$'
-            ),
+            match=(rf"{PARSING_ERR_MSG}"),
         ):
             to_datetime(arr, dayfirst=True)
 
@@ -2692,32 +2684,77 @@ def test_to_datetime_dta_tz(self, klass):
 
 class TestGuessDatetimeFormat:
     @pytest.mark.parametrize(
-        "test_list",
+        "test_list, expected_formats",
         [
-            [
-                "2011-12-30 00:00:00.000000",
-                "2011-12-30 00:00:00.000000",
-                "2011-12-30 00:00:00.000000",
-            ],
-            [np.nan, np.nan, "2011-12-30 00:00:00.000000"],
-            ["", "2011-12-30 00:00:00.000000"],
-            ["NaT", "2011-12-30 00:00:00.000000"],
-            ["2011-12-30 00:00:00.000000", "random_string"],
-            ["now", "2011-12-30 00:00:00.000000"],
-            ["today", "2011-12-30 00:00:00.000000"],
+            (
+                [
+                    "2011-12-30 00:00:00.000000",
+                    "2011-12-30 00:00:00.000000",
+                    "2011-12-30 00:00:00.000000",
+                ],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+            ),
+            (
+                [np.nan, np.nan, "2011-12-30 00:00:00.000000"],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+            ),
+            (
+                ["", "2011-12-30 00:00:00.000000"],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+            ),
+            (
+                ["NaT", "2011-12-30 00:00:00.000000"],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+            ),
+            (
+                ["2011-12-30 00:00:00.000000", "random_string"],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 50)], dtype=object),
+            ),
+            (
+                ["now", "2011-12-30 00:00:00.000000"],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+            ),
+            (
+                ["today", "2011-12-30 00:00:00.000000"],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+            ),
+            (
+                ["30-12-2012 00:00:00.000000", "12-30-2012 00:00:00.000000"],
+                np.array(
+                    [("%d-%m-%Y %H:%M:%S.%f", 50), ("%m-%d-%Y %H:%M:%S.%f", 50)],
+                    dtype=object,
+                ),
+            ),
+            (
+                ["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"],
+                np.array(
+                    [("%d-%m-%Y", 33), ("%m/%d/%Y", 33), ("%Y-%m-%d %H:%M:%S.%f", 33)],
+                    dtype=object,
+                ),
+            ),
         ],
     )
-    def test_guess_datetime_format_for_array(self, test_list):
-        expected_format = "%Y-%m-%d %H:%M:%S.%f"
+    def test_guess_datetime_format_for_array(self, test_list, expected_formats):
         test_array = np.array(test_list, dtype=object)
-        assert tools._guess_datetime_format_for_array(test_array) == expected_format
+        res = tools._guess_datetime_format_for_array(
+            test_array, n_find_format=5, n_check_format=5
+        )
+        # sort according to first element of tuple (format string) to ignore order
+        sorted_index = np.argsort([x[0] for x in res])
+        res = res[sorted_index]
+        sorted_index = np.argsort([x[0] for x in expected_formats])
+        expected_formats = expected_formats[sorted_index]
+        assert (res == expected_formats).all()
+        # TODO more tests
 
     @td.skip_if_not_us_locale
     def test_guess_datetime_format_for_array_all_nans(self):
         format_for_string_of_nans = tools._guess_datetime_format_for_array(
-            np.array([np.nan, np.nan, np.nan], dtype="O")
+            np.array([np.nan, np.nan, np.nan], dtype="O"),
+            n_find_format=5,
+            n_check_format=5,
         )
-        assert format_for_string_of_nans is None
+        assert len(format_for_string_of_nans) == 0
 
 
 class TestToDatetimeInferFormat:
@@ -2741,10 +2778,7 @@ def test_to_datetime_infer_datetime_format_consistent_format(
     def test_to_datetime_inconsistent_format(self, cache):
         data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"]
         ser = Series(np.array(data))
-        msg = (
-            r'^time data "01-02-2011 00:00:00" doesn\'t match format '
-            rf'"%m/%d/%Y %H:%M:%S", at position 1. {PARSING_ERR_MSG}$'
-        )
+        msg = f"{PARSING_ERR_MSG}"
         with pytest.raises(ValueError, match=msg):
             to_datetime(ser, cache=cache)
 
@@ -3595,3 +3629,206 @@ def test_from_numeric_arrow_dtype(any_numeric_ea_dtype):
     result = to_datetime(ser)
     expected = Series([1, 2], dtype="datetime64[ns]")
     tm.assert_series_equal(result, expected)
+
+
+class TestParsingMultipleDates:
+    # TODO handle yearfirst
+    @pytest.mark.parametrize(
+        "date_str, expected_format, dayfirst",
+        [
+            (["2010-01-01", "2010-02-02", "2010-01-03"], "%Y-%m-%d", None),
+            (["2010-01-01", "2010-02-13", "2010-01-03"], "%Y-%m-%d", None),
+            (["01-01-2012", "01-13-2012", "01-03-2010"], "%m-%d-%Y", False),
+            (["01-01-2012", "13-01-2012", "01-03-2010"], "%d-%m-%Y", True),
+        ],
+    )
+    def test_multiple_dates_non_ambiguous(self, date_str, expected_format, dayfirst):
+        # only one format can parse every date in the list
+        expected = to_datetime(date_str, format=expected_format)
+
+        # every `errors` mode should pick the format
+        # which works for all dates
+        for errors in ["raise", "coerce", "ignore"]:
+            for try_dayfirst in [True, False]:
+                # warn if we contradict dayfirst
+                # we don't warn when format is "%Y-%m-%d"
+                # TODO same for yearfirst
+                if dayfirst is not None and try_dayfirst != dayfirst:
+                    with tm.assert_produces_warning(UserWarning):
+                        result = to_datetime(
+                            date_str, errors=errors, dayfirst=try_dayfirst
+                        )
+                else:
+                    result = to_datetime(date_str, errors=errors, dayfirst=try_dayfirst)
+                tm.assert_index_equal(result, expected)
+
+        # should also work with format="mixed"
+        result = to_datetime(date_str, format="mixed")
+        tm.assert_index_equal(result, expected)
+
+    # ambiguous dates
+    @pytest.mark.parametrize(
+        "date_str",
+        [
+            (["01-01-2012", "01-05-2012", "01-03-2010"]),
+            (["01-01-2012", "05-01-2012", "01-03-2010"]),
+        ],
+    )
+    def test_multiple_dates_ambiguous(self, date_str):
+        # multiple formats work for all dates
+        # we should respect the dayfirst argument
+        expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y")
+        expected_not_dayfirst = to_datetime(date_str, format="%m-%d-%Y")
+
+        for errors in ["raise", "coerce", "ignore"]:
+            result_dayfirst = to_datetime(date_str, errors=errors, dayfirst=True)
+            tm.assert_index_equal(result_dayfirst, expected_dayfirst)
+
+            result_not_dayfirst = to_datetime(date_str, errors=errors, dayfirst=False)
+            tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst)
+
+        # should also work with format="mixed"
+        result_dayfirst = to_datetime(date_str, format="mixed", dayfirst=True)
+        tm.assert_index_equal(result_dayfirst, expected_dayfirst)
+
+        result_not_dayfirst = to_datetime(date_str, format="mixed", dayfirst=False)
+        tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst)
+
+    # ambiguous dates with errors
+    @pytest.mark.parametrize(
+        "date_str",
+        [
+            (["01-01-2012", "01-05-2012", "random_string", "01-03-2010"]),
+            (["01-01-2012", "05-01-2012", "random_string", "01-03-2010"]),
+        ],
+    )
+    def test_multiple_dates_ambiguous_error(self, date_str):
+        # multiple formats work for all valid dates (one entry is unparseable)
+        # we should respect the dayfirst argument
+        for errors in ["coerce", "ignore"]:
+            expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y", errors=errors)
+            expected_not_dayfirst = to_datetime(
+                date_str, format="%m-%d-%Y", errors=errors
+            )
+            result_dayfirst = to_datetime(date_str, errors=errors, dayfirst=True)
+            tm.assert_index_equal(result_dayfirst, expected_dayfirst)
+
+            result_not_dayfirst = to_datetime(date_str, errors=errors, dayfirst=False)
+            tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst)
+
+        # should raise an error with "raise"
+        with pytest.raises(
+            ValueError,
+            match="No datetime format was found which matched all values in the array",
+        ):
+            to_datetime(date_str, errors="raise", dayfirst=True)
+        with pytest.raises(
+            ValueError,
+            match="No datetime format was found which matched all values in the array",
+        ):
+            to_datetime(date_str, errors="raise", dayfirst=False)
+
+        # same with mixed
+        for errors in ["coerce", "ignore"]:
+            expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y", errors=errors)
+            expected_not_dayfirst = to_datetime(
+                date_str, format="%m-%d-%Y", errors=errors
+            )
+            result_dayfirst = to_datetime(
+                date_str, errors=errors, dayfirst=True, format="mixed"
+            )
+            tm.assert_index_equal(result_dayfirst, expected_dayfirst)
+
+            result_not_dayfirst = to_datetime(
+                date_str, errors=errors, dayfirst=False, format="mixed"
+            )
+            tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst)
+
+        # format="mixed" with errors="raise" should name the unparseable value
+        with pytest.raises(
+            ValueError, match="""Unable to parse "random_string" as a date"""
+        ):
+            to_datetime(date_str, errors="raise", dayfirst=True, format="mixed")
+        with pytest.raises(
+            ValueError, match="""Unable to parse "random_string" as a date"""
+        ):
+            to_datetime(date_str, errors="raise", dayfirst=False, format="mixed")
+
+    # mixed formats
+    @pytest.mark.parametrize(
+        "date_str, expected_formats, expected_mixed",
+        [
+            (
+                [
+                    "01-02-2012",
+                    "13-05-2012",
+                    "14-03-2010",
+                    "03-13-2012",
+                    "15-05-2012",
+                    "03-13-2010",
+                ],
+                ["%d-%m-%Y", "%m-%d-%Y"],
+                DatetimeIndex(
+                    [
+                        "2012-02-01",
+                        "2012-05-13",
+                        "2010-03-14",
+                        "2012-03-13",
+                        "2012-05-15",
+                        "2010-03-13",
+                    ],
+                    dtype="datetime64[ns]",
+                ),
+            ),
+            (
+                [
+                    "01-02-2012",
+                    "05-13-2012",
+                    "03-14-2010",
+                    "13-03-2012",
+                    "05-15-2012",
+                    "13-03-2010",
+                ],
+                ["%m-%d-%Y", "%d-%m-%Y"],
+                DatetimeIndex(
+                    [
+                        "2012-01-02",
+                        "2012-05-13",
+                        "2010-03-14",
+                        "2012-03-13",
+                        "2012-05-15",
+                        "2010-03-13",
+                    ],
+                    dtype="datetime64[ns]",
+                ),
+            ),
+        ],
+    )
+    def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed):
+        # no format works for all dates
+        # raise should raise an error
+        with pytest.raises(
+            ValueError,
+            match="No datetime format was found which matched all values in the array",
+        ):
+            to_datetime(date_str, errors="raise")
+
+        # coerce and ignore should choose the format
+        # which works for the most dates (the first one)
+        for errors in ["coerce", "ignore"]:
+            expected = to_datetime(date_str, format=expected_formats[0], errors=errors)
+            if expected_formats[0] == "%d-%m-%Y":
+                # contradicting default dayfirst=False
+                with tm.assert_produces_warning(UserWarning):
+                    result = to_datetime(date_str, errors=errors)
+            else:
+                result = to_datetime(date_str, errors=errors)
+            tm.assert_index_equal(result, expected)
+
+        # if format="mixed", the conversion should be done from the best format
+        # to the worst format
+        result = to_datetime(date_str, format="mixed")
+        tm.assert_index_equal(result, expected_mixed)
+
+    # TODO multiple precision
+    # (["2011-12-30 00:00:00.000000", "2011-11-20"], "%Y-%m-%d", None)

From 93f9c7aeb4f0be29755bd1147a2bb521bb568293 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Wed, 12 Apr 2023 14:26:35 +0200
Subject: [PATCH 02/37] Update changelog

---
 doc/source/whatsnew/v2.1.0.rst | 48 +++++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index 38161a29a9ff7..e61a4d7702da7 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -16,7 +16,53 @@ Enhancements
 
 .. _whatsnew_210.enhancements.enhancement1:
 
-enhancement1
+``pd.to_datetime`` now tries to infer the datetime format of each string by considering
+the whole Series, and tries to find the format which work for most strings. If several
+formats work as well, the one which matches the ``dayfirst`` parameter is returned. If
+``format="mixed"``, pandas does the same thing, then tries the second best format on the
+strings which failed to parse with the first best format, and so on (:issue:`52508`).
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [1]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"])
+    Out[1]:
+    ValueError: time data "30-01-2012" doesn't match format "%m-%d-%Y", at position 2. You might want to try:
+    - passing `format` if your strings have a consistent format;
+    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
+    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.
+
+    In [2]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], errors="coerce")
+    Out[2]:
+    DatetimeIndex(['2012-01-02', '2012-01-03', 'NaT'], dtype='datetime64[ns]', freq=None)
+
+    In [3]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], format="mixed")
+    Out[3]:
+    DatetimeIndex(['2012-01-02', '2012-01-03', '2012-01-30'], dtype='datetime64[ns]', freq=None)
+
+
+*New behavior*:
+
+.. code-block:: ipython
+
+    In [1]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"])
+    Out[1]:
+    UserWarning: Parsing dates in %d-%m-%Y format when dayfirst=False was specified.
+    Pass `dayfirst=True` or specify a format to silence this warning.
+    DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]',
+    freq=None)
+
+    In [2]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], errors="coerce")
+    Out[2]:
+    UserWarning: Parsing dates in %d-%m-%Y format when dayfirst=False was specified. Pass `dayfirst=True` or specify a format to silence this warning.
+    DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None)
+
+    In [3]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], format="mixed")
+    Out[3]:
+    DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None)
+
+
 ^^^^^^^^^^^^
 
 .. _whatsnew_210.enhancements.enhancement2:

From c37d40fa83002a73ddb1d50e8d759ee318127fbd Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Wed, 12 Apr 2023 14:49:47 +0200
Subject: [PATCH 03/37] Add missing type hints

---
 pandas/_libs/tslib.pyi         |  1 +
 pandas/core/tools/datetimes.py | 43 ++++++++++++++++++++--------------
 2 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi
index 9819b5173db56..bd8748cd2650a 100644
--- a/pandas/_libs/tslib.pyi
+++ b/pandas/_libs/tslib.pyi
@@ -17,6 +17,7 @@ def array_with_unit_to_datetime(
     errors: str = ...,
 ) -> tuple[np.ndarray, tzinfo | None]: ...
 def first_non_null(values: np.ndarray) -> int: ...
+def random_non_null(values: np.ndarray, n: int) -> np.ndarray: ...
 def array_to_datetime(
     values: npt.NDArray[np.object_],
     errors: str = ...,
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index e785fe400c631..1a1195a536584 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -130,8 +130,7 @@ class FulldatetimeDict(YearMonthDayDict, total=False):
 # ---------------------------------------------------------------------
 
 
-def _check_format_dayfirst(format_string):
-    dayfirst = False
+def _check_format_dayfirst(format_string: str) -> bool | None:
     for char in ["%d", "%m", "%Y"]:
         if char not in format_string:
             return None
@@ -151,8 +150,10 @@ def _check_format_dayfirst(format_string):
 
 
 def _guess_datetime_format_for_array(
-    arr, n_find_format, n_check_format
-) -> ArrayLike[tuple[str, str]]:
+    arr: np.ndarray,
+    n_find_format: int,
+    n_check_format: int,
+) -> np.ndarray:
     """
     Guess the format of the datetime strings in an array.
 
@@ -180,7 +181,7 @@ def _guess_datetime_format_for_array(
     sample_check = arr[sample_idx]
     sample_find = sample_check[:n_find_format]
     if len(sample_idx) == 0:
-        return []  # FIXME
+        return np.array([], dtype=object)
     format_found = set()
     for datetime_string in sample_find:
         # catch warnings from guess_datetime_format
@@ -194,22 +195,19 @@ def _guess_datetime_format_for_array(
             if type(datetime_string) is str:
                 format_found.add(guess_datetime_format(datetime_string, dayfirst=False))
                 format_found.add(guess_datetime_format(datetime_string, dayfirst=True))
-    if None in format_found:
-        format_found.remove(None)
     # remove YDM as it does not exist
     # but is returned by guess_datetime_format
-    for format in list(format_found):
-        if re.match(r"%Y[-/_.]+%d[-/_.]+%m", format):
+    for format_ in list(format_found):
+        if format_ is None or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_):
             # doesn't exist but is returned by guess_datetime_format
-            # FIXME
-            format_found.remove(format)
+            format_found.remove(format_)
     # Try to apply the formats found
     # to a larger sample
     format_checked = []
-    for format in format_found:
-        converted = array_strptime(sample_check, fmt=format, errors="coerce")[0]
+    for format_ in format_found:
+        converted = array_strptime(sample_check, fmt=format_, errors="coerce")[0]
         format_checked.append(
-            (format, int(100 * np.sum(~np.isnan(converted)) / len(converted)))
+            (format_, int(100 * np.sum(~np.isnan(converted)) / len(converted)))
         )
     # Sort by the number of strings that match the format
     format_checked.sort(key=lambda x: x[1], reverse=True)
@@ -229,7 +227,10 @@ def _guess_datetime_format_for_array(
     return np.array(format_checked, dtype=object)
 
 
-def _try_to_repect_dayfirst(formats, dayfirst):
+def _try_to_repect_dayfirst(
+    formats: np.ndarray,
+    dayfirst: bool | None,
+) -> tuple[str, bool | None]:
     """
     If several formats work as well, prefer the format which
     respect dayfirst.
@@ -263,7 +264,16 @@ def _try_to_repect_dayfirst(formats, dayfirst):
     return best_formats[0][0], _check_format_dayfirst(best_formats[0][0])
 
 
-def _iterative_conversion(arg, formats, utc, unit, errors, dayfirst, yearfirst, exact):
+def _iterative_conversion(
+    arg: np.ndarray,
+    formats: np.ndarray,
+    utc: bool,
+    unit: str | None,
+    errors: DateTimeErrorChoices,
+    dayfirst: bool | None,
+    yearfirst: bool | None,
+    exact: bool,
+) -> tuple[np.ndarray, np.ndarray]:
     """
     For mixed format, convert datetimestrings iteratively,
     from the best format (the format which work for most samples)
@@ -328,7 +338,6 @@ def _iterative_conversion(arg, formats, utc, unit, errors, dayfirst, yearfirst,
         elif errors == "coerce":
             result[~indices_succeeded] = iNaT
         elif errors == "ignore":
-            # TODO check
             result = arg
     return result, tz_parsed
 

From cbb5e0df0634abdfd79ab792c1f084d4405ce993 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Wed, 12 Apr 2023 15:45:46 +0200
Subject: [PATCH 04/37] Cleaning

---
 pandas/_libs/tslib.pyx                 | 4 ----
 pandas/tests/tools/test_to_datetime.py | 3 ---
 2 files changed, 7 deletions(-)

diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index 8b790e3bd8adc..9b2ca61e12c29 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -69,8 +69,6 @@ from pandas._libs.tslibs.nattype cimport (
 )
 from pandas._libs.tslibs.timestamps cimport _Timestamp
 
-import cython
-
 from pandas._libs.tslibs import (
     Resolution,
     get_resolution,
@@ -83,8 +81,6 @@ from libc.time cimport time
 from pandas._libs.missing cimport checknull_with_nat_and_na
 from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
 
-# Note: this is the only non-tslibs intra-pandas dependency here
-
 
 def _test_parse_iso8601(ts: str):
     """
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index 326810177fb3a..f33dc1603d4d5 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -3829,6 +3829,3 @@ def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed):
         # to the worst format
         result = to_datetime(date_str, format="mixed")
         tm.assert_index_equal(result, expected_mixed)
-
-    # TODO multiple precision
-    # (["2011-12-30 00:00:00.000000", "2011-11-20"], "%Y-%m-%d", None)

From 81664a21bb9f22afd3d2dfc43bd6894a7b8e8a35 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Wed, 12 Apr 2023 15:48:27 +0200
Subject: [PATCH 05/37] Typo

---
 doc/source/whatsnew/v2.1.0.rst | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index e61a4d7702da7..861ecd6377c24 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -17,7 +17,8 @@ Enhancements
 .. _whatsnew_210.enhancements.enhancement1:
 
 ``pd.to_datetime`` now tries to infer the datetime format of each string by considering
-the whole Series, and tries to find the format which work for most strings. If several
+a random sample (instead of the first non-null sample),
+and tries to find the format which works for most strings. If several
 formats work as well, the one which matches the ``dayfirst`` parameter is returned. If
 ``format="mixed"``, pandas does the same thing, then tries the second best format on the
 strings which failed to parse with the first best format, and so on (:issue:`52508`).
@@ -63,8 +64,6 @@ strings which failed to parse with the first best format, and so on (:issue:`525
     DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None)
 
 
-^^^^^^^^^^^^
-
 .. _whatsnew_210.enhancements.enhancement2:
 
 ``map(func, na_action="ignore")`` now works for all array types

From ef33ba0d0e4987f315a6086a18878e5996de2fbf Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Wed, 12 Apr 2023 15:52:00 +0200
Subject: [PATCH 06/37] comment change

---
 pandas/core/tools/datetimes.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 1a1195a536584..0f0b91b8005ab 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -635,14 +635,13 @@ def _convert_listlike_datetimes(
 
     arg = ensure_object(arg)
 
-    # get the list of formats which work for some of the elements
-    # sorted by the percentage of elements that match, highest first
-    # It's a list of tuples of (format, percentage of elements that match)
     best_format = None
     if format is not None and format != "mixed":
         best_format = format
     else:
-        # guess the format
+        # get a list of formats which work for some of the elements
+        # sorted by the percentage of elements that match, highest first
+        # It's a list of tuples of (format, percentage of elements that match)
         formats = _guess_datetime_format_for_array(
             arg, n_find_format=20, n_check_format=250
         )

From e1652f15b029a47956aa2d36beabc153faf0fcec Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Thu, 13 Apr 2023 13:07:32 +0200
Subject: [PATCH 07/37] simplification

---
 pandas/core/tools/datetimes.py         | 238 ++++++++++++-------------
 pandas/tests/tools/test_to_datetime.py | 111 ++++++------
 2 files changed, 170 insertions(+), 179 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 0f0b91b8005ab..61ac201fc3662 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -149,11 +149,63 @@ def _check_format_dayfirst(format_string: str) -> bool | None:
     return dayfirst
 
 
+def _try_to_repect_dayfirst(
+    formats: np.ndarray,
+    dayfirst: bool | None,
+    warn: bool,
+) -> str:
+    """
+    If several formats work equally well, prefer the format which
+    respects dayfirst.
+
+    Parameters
+    ----------
+    formats : ndarray
+        Array of tuples with the format and the percentage of strings that
+        match the format, sorted by the percentage of strings that match the
+        format.
+    dayfirst : bool
+        Should we prefer dayfirst formats
+
+    Returns
+    -------
+    best_format :  str
+        The format among the best formats which respect dayfirst,
+        if any, otherwise the first best format.
+    """
+    # Find all formats which work for
+    # the largest number of samples
+    best_formats = [
+        formats_found for formats_found in formats if formats_found[1] == formats[0][1]
+    ]
+    # If several formats work equally well, prefer the format which
+    # respects dayfirst
+    if len(best_formats) > 1:
+        for formats_found in best_formats:
+            if _check_format_dayfirst(formats_found[0]) == dayfirst:
+                return formats_found[0]
+    if (
+        warn
+        and _check_format_dayfirst(best_formats[0][0]) is not None
+        and _check_format_dayfirst(best_formats[0][0]) != dayfirst
+    ):
+        warnings.warn(
+            f"Parsing dates in {best_formats[0][0]} format when "
+            f"dayfirst={dayfirst} was specified. "
+            f"Pass `dayfirst={not dayfirst}` or specify a format "
+            "to silence this warning.",
+            stacklevel=find_stack_level(),
+        )
+    return best_formats[0][0]
+
+
 def _guess_datetime_format_for_array(
     arr: np.ndarray,
-    n_find_format: int,
-    n_check_format: int,
-) -> np.ndarray:
+    dayfirst: bool | None,
+    n_find_format: int = 10,
+    n_check_format: int = 200,
+    warn: bool = True,
+) -> str | None:
     """
     Guess the format of the datetime strings in an array.
 
@@ -165,6 +217,8 @@ def _guess_datetime_format_for_array(
         Number of strings to use to guess the format.
     n_check_format : int
         Number of strings to check for each format found.
+    warn : bool
+        Whether to warn if we contradict dayfirst.
 
     Returns
     -------
@@ -181,8 +235,8 @@ def _guess_datetime_format_for_array(
     sample_check = arr[sample_idx]
     sample_find = sample_check[:n_find_format]
     if len(sample_idx) == 0:
-        return np.array([], dtype=object)
-    format_found = set()
+        return None
+    formats_found = set()
     for datetime_string in sample_find:
         # catch warnings from guess_datetime_format
         # which appears when dayfirst is contradicted
@@ -193,26 +247,26 @@ def _guess_datetime_format_for_array(
                 message="Parsing dates in .* format when dayfirst=.* was specified.",
             )
             if type(datetime_string) is str:
-                format_found.add(guess_datetime_format(datetime_string, dayfirst=False))
-                format_found.add(guess_datetime_format(datetime_string, dayfirst=True))
+                formats_found.add(
+                    guess_datetime_format(datetime_string, dayfirst=False)
+                )
+                formats_found.add(guess_datetime_format(datetime_string, dayfirst=True))
     # remove YDM as it does not exist
     # but is returned by guess_datetime_format
-    for format_ in list(format_found):
-        if format_ is None or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_):
+    for format_ in list(formats_found):
+        if (format_ is None) or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_):
             # doesn't exist but is returned by guess_datetime_format
-            format_found.remove(format_)
+            formats_found.remove(format_)
     # Try to apply the formats found
     # to a larger sample
-    format_checked = []
-    for format_ in format_found:
+    formats_checked = []
+    for format_ in formats_found:
         converted = array_strptime(sample_check, fmt=format_, errors="coerce")[0]
-        format_checked.append(
+        formats_checked.append(
             (format_, int(100 * np.sum(~np.isnan(converted)) / len(converted)))
         )
-    # Sort by the number of strings that match the format
-    format_checked.sort(key=lambda x: x[1], reverse=True)
     if (
-        len(format_checked) == 0
+        len(formats_checked) == 0
         and len(sample_check) > 1
         and np.any([type(e) is str for e in sample_find])
         # GH#32264 np.str_ objects
@@ -224,49 +278,18 @@ def _guess_datetime_format_for_array(
             UserWarning,
             stacklevel=find_stack_level(),
         )
-    return np.array(format_checked, dtype=object)
-
-
-def _try_to_repect_dayfirst(
-    formats: np.ndarray,
-    dayfirst: bool | None,
-) -> tuple[str, bool | None]:
-    """
-    If several formats work as well, prefer the format which
-    respect dayfirst.
-
-    Parameters
-    ----------
-    formats : ndarray
-        Array of tuples with the format and the percentage of strings that
-        match the format, sorted by the percentage of strings that match the
-        format.
-    dayfirst : bool
-        Should we prefer dayfirst formats
-
-    Returns
-    -------
-    best_format :  str
-        The format among the best formats which respect dayfirst,
-        if any, otherwise the first best format.
-    """
-    # Find all formats which work for
-    # the largest number of samples
-    best_formats = [
-        format_found for format_found in formats if format_found[1] == formats[0][1]
-    ]
-    # If several formats work as well, prefer the format which
-    # respect dayfirst
-    if len(best_formats) > 1:
-        for format_found in best_formats:
-            if _check_format_dayfirst(format_found[0]) == dayfirst:
-                return format_found[0], _check_format_dayfirst(format_found[0])
-    return best_formats[0][0], _check_format_dayfirst(best_formats[0][0])
+    if not len(formats_checked):
+        return None
+    else:
+        # Sort by the number of strings that match the format
+        formats_checked.sort(key=lambda x: x[1], reverse=True)
+        best_format = _try_to_repect_dayfirst(formats_checked, dayfirst, warn)
+        return best_format
 
 
 def _iterative_conversion(
     arg: np.ndarray,
-    formats: np.ndarray,
+    name: str,
     utc: bool,
     unit: str | None,
     errors: DateTimeErrorChoices,
@@ -283,10 +306,8 @@ def _iterative_conversion(
     ----------
     arg : ndarray
         Array of datetime strings.
-    formats : ndarray
-        Array of tuples with the format and the percentage of strings that
-        match the format, sorted by the percentage of strings that match the
-        format.
+    name : str
+        Name of the argument.
     utc : bool
         Whether to convert/localize timestamps to UTC.
     unit : str
@@ -303,16 +324,18 @@ def _iterative_conversion(
     """
     # iteratively convert the remaining samples
     # in "coerce" mode with the ith best format
-    # until all values are converted or all formats are exhausted
     # or 10 formats have been tried
-    best_format = _try_to_repect_dayfirst(formats, dayfirst)[0]
-    # remove the best format from the list
-    formats = formats[formats[:, 0] != best_format]
+    # if we contradict dayfirst, we warn for the first format, but not the rest
+    best_format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst, warn=True)
     result, tz_parsed = array_strptime(arg, best_format, exact, "coerce", utc)
     indices_succeeded = notna(result)
-    for _ in range(min(len(formats), 10)):
-        best_format = _try_to_repect_dayfirst(formats, dayfirst)[0]
-        formats = formats[formats[:, 0] != best_format]
+    for _ in range(10):
+        best_format = _guess_datetime_format_for_array(
+            arg[~indices_succeeded], dayfirst=dayfirst, warn=False
+        )
+
+        if best_format is None:
+            break
         results_format, timezones_format = array_strptime(
             arg[~indices_succeeded], best_format, exact, "coerce", utc
         )
@@ -339,7 +362,11 @@ def _iterative_conversion(
             result[~indices_succeeded] = iNaT
         elif errors == "ignore":
             result = arg
-    return result, tz_parsed
+
+    if any(tz is not None for tz in tz_parsed):
+        return _return_parsed_timezone_results(result, tz_parsed, utc, name)
+
+    return _box_as_indexlike(result, utc=utc, name=name)
 
 
 def should_cache(
@@ -635,64 +662,33 @@ def _convert_listlike_datetimes(
 
     arg = ensure_object(arg)
 
-    best_format = None
+    if format is None:
+        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
+
+    # `format` could be inferred, or user didn't ask for mixed-format parsing.
     if format is not None and format != "mixed":
-        best_format = format
-    else:
-        # get a list of formats which work for some of the elements
-        # sorted by the percentage of elements that match, highest first
-        # It's a list of tuples of (format, percentage of elements that match)
-        formats = _guess_datetime_format_for_array(
-            arg, n_find_format=20, n_check_format=250
-        )
-        if len(formats) == 0:
-            result, tz_parsed = objects_to_datetime64ns(
-                arg,
-                dayfirst=dayfirst,
-                yearfirst=yearfirst,
-                utc=utc,
-                errors=errors,
-                allow_object=True,
-            )
-            if tz_parsed is not None:
-                # We can take a shortcut since the datetime64 numpy array
-                # is in UTC
-                dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
-                return DatetimeIndex._simple_new(dta, name=name)
-        if format != "mixed" and len(formats) > 0:
-            # formats[0][1] is the percentage of elements that matched
-            if errors == "raise" and formats[0][1] != 100:
-                raise ValueError(
-                    "No datetime format was found which "
-                    "matched all values in the array.\n"
-                    "You might want to try:\n"
-                    "    - passing `format` if your strings have a consistent format;\n"
-                    "    - passing `format='ISO8601'` if your strings are "
-                    "all ISO8601 but not necessarily in exactly the same format;\n"
-                    "    - passing `format='mixed'`, and the format will be "
-                    "inferred for each element individually. "
-                    "You might want to use `dayfirst` alongside this.\n"
-                    f"Best format found: {formats[0][0]} "
-                    "(matched {formats[0][1]}% of the values)"
-                )
-            best_format, best_format_dayfirst = _try_to_repect_dayfirst(
-                formats, dayfirst
-            )
-            if best_format_dayfirst is not None and best_format_dayfirst != dayfirst:
-                warnings.warn(
-                    f"Parsing dates in {best_format} format when "
-                    f"dayfirst={dayfirst} was specified. "
-                    f"Pass `dayfirst={not dayfirst}` or specify a format "
-                    "to silence this warning.",
-                    stacklevel=find_stack_level(),
-                )
-    if best_format is not None:
-        return _array_strptime_with_fallback(arg, name, utc, best_format, exact, errors)
+        return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
+
     if format == "mixed":
-        result, tz_parsed = _iterative_conversion(
-            arg, formats, utc, unit, errors, dayfirst, yearfirst, exact
+        return _iterative_conversion(
+            arg, name, utc, unit, errors, dayfirst, yearfirst, exact
         )
 
+    result, tz_parsed = objects_to_datetime64ns(
+        arg,
+        dayfirst=dayfirst,
+        yearfirst=yearfirst,
+        utc=utc,
+        errors=errors,
+        allow_object=True,
+    )
+
+    if tz_parsed is not None:
+        # We can take a shortcut since the datetime64 numpy array
+        # is in UTC
+        dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
+        return DatetimeIndex._simple_new(dta, name=name)
+
     return _box_as_indexlike(result, utc=utc, name=name)
 
 
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index f33dc1603d4d5..ed967c9a128b2 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -2684,7 +2684,7 @@ def test_to_datetime_dta_tz(self, klass):
 
 class TestGuessDatetimeFormat:
     @pytest.mark.parametrize(
-        "test_list, expected_formats",
+        "test_list, expected_format",
         [
             (
                 [
@@ -2692,69 +2692,37 @@ class TestGuessDatetimeFormat:
                     "2011-12-30 00:00:00.000000",
                     "2011-12-30 00:00:00.000000",
                 ],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
-            ),
-            (
-                [np.nan, np.nan, "2011-12-30 00:00:00.000000"],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
-            ),
-            (
-                ["", "2011-12-30 00:00:00.000000"],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
-            ),
-            (
-                ["NaT", "2011-12-30 00:00:00.000000"],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
-            ),
-            (
-                ["2011-12-30 00:00:00.000000", "random_string"],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 50)], dtype=object),
-            ),
-            (
-                ["now", "2011-12-30 00:00:00.000000"],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
-            ),
-            (
-                ["today", "2011-12-30 00:00:00.000000"],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+                "%Y-%m-%d %H:%M:%S.%f",
             ),
+            ([np.nan, np.nan, "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"),
+            (["", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"),
+            (["NaT", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"),
+            (["2011-12-30 00:00:00.000000", "random_string"], "%Y-%m-%d %H:%M:%S.%f"),
+            (["now", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"),
+            (["today", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"),
             (
                 ["30-12-2012 00:00:00.000000", "12-30-2012 00:00:00.000000"],
-                np.array(
-                    [("%d-%m-%Y %H:%M:%S.%f", 50), ("%m-%d-%Y %H:%M:%S.%f", 50)],
-                    dtype=object,
-                ),
-            ),
-            (
-                ["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"],
-                np.array(
-                    [("%d-%m-%Y", 33), ("%m/%d/%Y", 33), ("%Y-%m-%d %H:%M:%S.%f", 33)],
-                    dtype=object,
-                ),
+                "%m-%d-%Y %H:%M:%S.%f",
             ),
+            (["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"], "%m/%d/%Y"),
         ],
     )
-    def test_guess_datetime_format_for_array(self, test_list, expected_formats):
+    def test_guess_datetime_format_for_array(self, test_list, expected_format):
         test_array = np.array(test_list, dtype=object)
         res = tools._guess_datetime_format_for_array(
-            test_array, n_find_format=5, n_check_format=5
+            test_array, dayfirst=False, n_find_format=5, n_check_format=5
         )
-        # sort according to first element of tuple (format string) to ignore order
-        sorted_index = np.argsort([x[0] for x in res])
-        res = res[sorted_index]
-        sorted_index = np.argsort([x[0] for x in expected_formats])
-        expected_formats = expected_formats[sorted_index]
-        assert (res == expected_formats).all()
-        # TODO more tests
+        assert res == expected_format
 
     @td.skip_if_not_us_locale
     def test_guess_datetime_format_for_array_all_nans(self):
         format_for_string_of_nans = tools._guess_datetime_format_for_array(
             np.array([np.nan, np.nan, np.nan], dtype="O"),
+            dayfirst=False,
             n_find_format=5,
             n_check_format=5,
         )
-        assert len(format_for_string_of_nans) == 0
+        assert format_for_string_of_nans is None
 
 
 class TestToDatetimeInferFormat:
@@ -3658,13 +3626,20 @@ def test_multiple_dates_non_ambiguous(self, date_str, expected_format, dayfirst)
                         result = to_datetime(
                             date_str, errors=errors, dayfirst=try_dayfirst
                         )
+                        # should also work for format="mixed"
+                        result_mixed = to_datetime(
+                            date_str,
+                            errors=errors,
+                            dayfirst=try_dayfirst,
+                            format="mixed",
+                        )
                 else:
                     result = to_datetime(date_str, errors=errors, dayfirst=try_dayfirst)
+                    result_mixed = to_datetime(
+                        date_str, errors=errors, dayfirst=try_dayfirst, format="mixed"
+                    )
                 tm.assert_index_equal(result, expected)
-
-        # should also work with format="mixed"
-        result = to_datetime(date_str, format="mixed")
-        tm.assert_index_equal(result, expected)
+                tm.assert_index_equal(result_mixed, expected)
 
     # ambiguous dates
     @pytest.mark.parametrize(
@@ -3719,12 +3694,16 @@ def test_multiple_dates_ambiguous_error(self, date_str):
         # should raise an error with "raise"
         with pytest.raises(
             ValueError,
-            match="No datetime format was found which matched all values in the array",
+            match="""time data "random_string" doesn't match format "%d-%m-%Y", """
+            "at position 2. "
+            f"{PARSING_ERR_MSG}",
         ):
             to_datetime(date_str, errors="raise", dayfirst=True)
         with pytest.raises(
             ValueError,
-            match="No datetime format was found which matched all values in the array",
+            match="""time data "random_string" doesn't match format "%m-%d-%Y", """
+            "at position 2. "
+            f"{PARSING_ERR_MSG}",
         ):
             to_datetime(date_str, errors="raise", dayfirst=False)
 
@@ -3807,12 +3786,18 @@ def test_multiple_dates_ambiguous_error(self, date_str):
     def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed):
         # no format works for all dates
         # raise should raise an error
+        msg = r'^time data ".*" doesn\'t match format ".*", at position .*'
         with pytest.raises(
             ValueError,
-            match="No datetime format was found which matched all values in the array",
+            match=msg,
         ):
-            to_datetime(date_str, errors="raise")
-
+            if expected_formats[0] == "%d-%m-%Y":
+                # contradicting default dayfirst=False
+                with tm.assert_produces_warning(UserWarning):
+                    # FIXME: do we need to raise a warning here?
+                    to_datetime(date_str, errors="raise")
+            else:
+                to_datetime(date_str, errors="raise")
         # coerce and ignore should choose the format
         # which works for the most dates (the first one)
         for errors in ["coerce", "ignore"]:
@@ -3827,5 +3812,15 @@ def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed):
 
         # if format="mixed", the conversion should be done from the best format
         # to the worst format
-        result = to_datetime(date_str, format="mixed")
-        tm.assert_index_equal(result, expected_mixed)
+        for errors in ["raise", "coerce", "ignore"]:
+            if expected_formats[0] == "%d-%m-%Y":
+                # we raise a warning if the best format used
+                # (the one which works for the most dates)
+                # contradict the default dayfirst=False
+                with tm.assert_produces_warning(UserWarning):
+                    result = to_datetime(date_str, format="mixed", errors=errors)
+            else:
+                # we don't raise a warning if other formats used
+                # contradict dayfirst
+                result = to_datetime(date_str, format="mixed", errors=errors)
+            tm.assert_index_equal(result, expected_mixed)

From 6b371ca6318a0a6d473e648560d72a5b22f96e56 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Thu, 13 Apr 2023 13:48:55 +0200
Subject: [PATCH 08/37] remove randomness

---
 pandas/_libs/tslib.pyx         | 10 ++++++----
 pandas/core/tools/datetimes.py | 14 ++++++--------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index 9b2ca61e12c29..fb96c9d6115c7 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -401,8 +401,8 @@ def first_non_null(values: ndarray) -> int:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def random_non_null(values: ndarray, int n) -> ndarray:
-    """Find n non-null values selected at random, return an array of indices."""
+def evenly_spaced_non_null(values: ndarray, int n) -> ndarray:
+    """Find n evenly spaced non-null values, return an array of indices."""
     cdef:
         Py_ssize_t total = len(values)
         Py_ssize_t i, non_null_count
@@ -422,8 +422,10 @@ def random_non_null(values: ndarray, int n) -> ndarray:
     non_null_count = len(non_null_indices)
     if non_null_count == 0 or n <= 0:
         return np.empty(0, dtype=np.int64)
-    # use np.random.choice
-    return np.random.choice(non_null_indices, min(n, non_null_count), replace=False)
+    evenly_spaced_indices = np.linspace(0, len(non_null_indices) - 1,
+                                        min(len(non_null_indices), n),
+                                        dtype=int)
+    return np.array(non_null_indices)[evenly_spaced_indices]
 
 
 @cython.wraparound(False)
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 61ac201fc3662..05046e9850f98 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -227,15 +227,13 @@ def _guess_datetime_format_for_array(
         match the format, sorted by the percentage of strings that match the
         format.
     """
-    # Extract a random sample of datetime strings
-    assert (
-        n_find_format <= n_check_format
-    ), "n_check_format must be greater than n_find_format"
-    sample_idx = tslib.random_non_null(arr, n_check_format)
-    sample_check = arr[sample_idx]
-    sample_find = sample_check[:n_find_format]
-    if len(sample_idx) == 0:
+    # Extract a sample of datetime strings
+    idx_find = tslib.evenly_spaced_non_null(arr, n_find_format)
+    if len(idx_find) == 0:
         return None
+    idx_check = tslib.evenly_spaced_non_null(arr, n_check_format)
+    sample_check = arr[idx_check]
+    sample_find = arr[idx_find]
     formats_found = set()
     for datetime_string in sample_find:
         # catch warnings from guess_datetime_format

From 705d1b4daca4a1469d32e14a2ac94da54a3e6ef9 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 10:08:45 +0200
Subject: [PATCH 09/37] fix parser tests

---
 pandas/core/tools/datetimes.py             |  5 +++--
 pandas/tests/io/parser/test_parse_dates.py | 17 +++++++++--------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 05046e9850f98..105d8c4582d05 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -189,9 +189,10 @@ def _try_to_repect_dayfirst(
         and _check_format_dayfirst(best_formats[0][0]) is not None
         and _check_format_dayfirst(best_formats[0][0]) != dayfirst
     ):
+        default_string = " (the default)" if not dayfirst else ""
         warnings.warn(
             f"Parsing dates in {best_formats[0][0]} format when "
-            f"dayfirst={dayfirst} was specified. "
+            f"dayfirst={dayfirst}{default_string} was specified. "
             f"Pass `dayfirst={not dayfirst}` or specify a format "
             "to silence this warning.",
             stacklevel=find_stack_level(),
@@ -252,7 +253,7 @@ def _guess_datetime_format_for_array(
     # remove YDM as it does not exist
     # but is returned by guess_datetime_format
     for format_ in list(formats_found):
-        if (format_ is None) or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_):
+        if (format_ is None) or re.match(r".*%Y.*%d.*%m.*", format_):
             # doesn't exist but is returned by guess_datetime_format
             formats_found.remove(format_)
     # Try to apply the formats found
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 8c3474220cde8..826bee7ba4dbd 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1806,11 +1806,13 @@ def test_parse_delimited_date_swap_with_warning(
 
 def test_parse_multiple_delimited_dates_with_swap_warnings():
     # GH46210
-    with pytest.raises(
-        ValueError,
+    with tm.assert_produces_warning(
+        UserWarning,
         match=(
-            r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", '
-            r"at position 1. You might want to try:"
+            "Parsing dates in %d/%m/%Y format when "
+            "dayfirst=False \\(the default\\) was specified. "
+            "Pass `dayfirst=True` or specify a format "
+            "to silence this warning."
         ),
     ):
         pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"])
@@ -2020,10 +2022,9 @@ def test_dayfirst_warnings():
     tm.assert_index_equal(expected, res5)
 
     # B. use dayfirst=False
-    with tm.assert_produces_warning(UserWarning, match=warning_msg):
-        res6 = read_csv(
-            StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date"
-        ).index
+    res6 = read_csv(
+        StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date"
+    ).index
     tm.assert_index_equal(expected, res6)
 
 

From 0bae15d44f31cb1c0b30c836d0511d30b6bfb5ba Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 11:18:41 +0200
Subject: [PATCH 10/37] simplify getting evenly spaced non null

---
 pandas/_libs/tslib.pyi         |  1 -
 pandas/_libs/tslib.pyx         | 32 +-------------------------------
 pandas/core/tools/datetimes.py | 17 ++++++++++++-----
 3 files changed, 13 insertions(+), 37 deletions(-)

diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi
index bd8748cd2650a..9819b5173db56 100644
--- a/pandas/_libs/tslib.pyi
+++ b/pandas/_libs/tslib.pyi
@@ -17,7 +17,6 @@ def array_with_unit_to_datetime(
     errors: str = ...,
 ) -> tuple[np.ndarray, tzinfo | None]: ...
 def first_non_null(values: np.ndarray) -> int: ...
-def random_non_null(values: np.ndarray, n: int) -> np.ndarray: ...
 def array_to_datetime(
     values: npt.NDArray[np.object_],
     errors: str = ...,
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index fb96c9d6115c7..106f203a16855 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -75,8 +75,7 @@ from pandas._libs.tslibs import (
 )
 from pandas._libs.tslibs.timestamps import Timestamp
 
-from libc.stdlib cimport srand
-from libc.time cimport time
+# Note: this is the only non-tslibs intra-pandas dependency here
 
 from pandas._libs.missing cimport checknull_with_nat_and_na
 from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
@@ -399,35 +398,6 @@ def first_non_null(values: ndarray) -> int:
         return -1
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def evenly_spaced_non_null(values: ndarray, int n) -> ndarray:
-    """Find n evenly spaced non-null values, return an array of indices."""
-    cdef:
-        Py_ssize_t total = len(values)
-        Py_ssize_t i, non_null_count
-        list non_null_indices = []
-    srand(time(NULL))
-    for i in range(total):
-        val = values[i]
-        if checknull_with_nat_and_na(val):
-            continue
-        if (
-            isinstance(val, str)
-            and
-            (len(val) == 0 or val in nat_strings or val in ("now", "today"))
-        ):
-            continue
-        non_null_indices.append(i)
-    non_null_count = len(non_null_indices)
-    if non_null_count == 0 or n <= 0:
-        return np.empty(0, dtype=np.int64)
-    evenly_spaced_indices = np.linspace(0, len(non_null_indices) - 1,
-                                        min(len(non_null_indices), n),
-                                        dtype=int)
-    return np.array(non_null_indices)[evenly_spaced_indices]
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 cpdef array_to_datetime(
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 105d8c4582d05..32a1cbaab5a9d 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -229,12 +229,19 @@ def _guess_datetime_format_for_array(
         format.
     """
     # Extract a sample of datetime strings
-    idx_find = tslib.evenly_spaced_non_null(arr, n_find_format)
-    if len(idx_find) == 0:
+    # ignore missing
+    arr_non_null = arr[notna(arr)]
+    arr_non_null = arr_non_null[
+        ~np.isin(arr_non_null, ["", "now", "today"] + list(nat_strings))
+    ]
+    if len(arr_non_null) == 0:
         return None
-    idx_check = tslib.evenly_spaced_non_null(arr, n_check_format)
-    sample_check = arr[idx_check]
-    sample_find = arr[idx_find]
+    # get evenly spaced non-null indices
+    step_find = max(len(arr_non_null) // n_find_format, 1)
+    step_check = max(len(arr_non_null) // n_check_format, 1)
+    sample_check = arr_non_null[np.arange(0, len(arr_non_null), step_check)]
+    sample_find = arr_non_null[np.arange(0, len(arr_non_null), step_find)]
+    # try formats
     formats_found = set()
     for datetime_string in sample_find:
         # catch warnings from guess_datetime_format

From de7331f5dcbafba582d02855a0a36796759d1265 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 11:32:02 +0200
Subject: [PATCH 11/37] update io readme

---
 doc/source/user_guide/io.rst | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index c33d4ab92d4c6..7522fecfed3fb 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -977,11 +977,10 @@ Note that format inference is sensitive to ``dayfirst``.  With
 ``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With
 ``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th.
 
-If you try to parse a column of date strings, pandas will attempt to guess the format
-from the first non-NaN element, and will then parse the rest of the column with that
-format. If pandas fails to guess the format (for example if your first string is
-``'01 December US/Pacific 2000'``), then a warning will be raised and each
-row will be parsed individually by ``dateutil.parser.parse``. The safest
+If you try to parse a column of date strings, pandas will attempt to find the format
+which works best from a sample of non-NaN elements, and will then parse the rest of the
+column with that format. If pandas fails to guess the format, then a warning will be
+raised and each row will be parsed individually by ``dateutil.parser.parse``. The safest
 way to parse dates is to explicitly set ``format=``.
 
 .. ipython:: python
@@ -994,7 +993,9 @@ way to parse dates is to explicitly set ``format=``.
    df
 
 In the case that you have mixed datetime formats within the same column, you can
-pass  ``format='mixed'``
+pass  ``format='mixed'``. Pandas will convert rows to the best format found (the one
+which matches the most rows), and then iteratively convert the remaining rows with the
+remaining formats.
 
 .. ipython:: python
 

From 9136b4f57fd50d2cc3e5fd0d3f7844cfff2aed97 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 11:41:33 +0200
Subject: [PATCH 12/37] revert changed tests

---
 pandas/tests/tools/test_to_datetime.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index ed967c9a128b2..016e30e6dfda9 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -1313,7 +1313,10 @@ def test_datetime_bool_arrays_mixed(self, cache):
             to_datetime([False, datetime.today()], cache=cache)
         with pytest.raises(
             ValueError,
-            match=(f"{PARSING_ERR_MSG}"),
+            match=(
+                r'^time data "True" doesn\'t match format "%Y%m%d", '
+                f"at position 1. {PARSING_ERR_MSG}$"
+            ),
         ):
             to_datetime(["20130101", True], cache=cache)
         tm.assert_index_equal(
@@ -2401,7 +2404,10 @@ def test_to_datetime_on_datetime64_series(self, cache):
     def test_to_datetime_with_space_in_series(self, cache):
         # GH 6428
         ser = Series(["10/18/2006", "10/18/2008", " "])
-        msg = rf"{PARSING_ERR_MSG}"
+        msg = (
+            r'^time data " " doesn\'t match format "%m/%d/%Y", '
+            rf"at position 2. {PARSING_ERR_MSG}$"
+        )
         with pytest.raises(ValueError, match=msg):
             to_datetime(ser, errors="raise", cache=cache)
         result_coerce = to_datetime(ser, errors="coerce", cache=cache)
@@ -2746,7 +2752,10 @@ def test_to_datetime_infer_datetime_format_consistent_format(
     def test_to_datetime_inconsistent_format(self, cache):
         data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"]
         ser = Series(np.array(data))
-        msg = f"{PARSING_ERR_MSG}"
+        msg = (
+            r'^time data "01-02-2011 00:00:00" doesn\'t match format '
+            rf'"%m/%d/%Y %H:%M:%S", at position 1. {PARSING_ERR_MSG}$'
+        )
         with pytest.raises(ValueError, match=msg):
             to_datetime(ser, cache=cache)
 

From 9f966d56d22d5f890d40c6dfecd201f3cef56e90 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 12:51:40 +0200
Subject: [PATCH 13/37] fix type hints

---
 doc/source/whatsnew/v0.19.0.rst |  2 +-
 pandas/core/tools/datetimes.py  | 26 ++++++++++++++++----------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
index ab17cacd830e5..c300fc7f286db 100644
--- a/doc/source/whatsnew/v0.19.0.rst
+++ b/doc/source/whatsnew/v0.19.0.rst
@@ -765,7 +765,7 @@ Previously if ``.to_datetime()`` encountered mixed integers/floats and strings,
 This will now convert integers/floats with the default unit of ``ns``.
 
 .. ipython:: python
-
+   :okwarning:
    pd.to_datetime([1, "foo"], errors="coerce")
 
 Bug fixes related to ``.to_datetime()``:
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 32a1cbaab5a9d..d33af6482d492 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -150,7 +150,7 @@ def _check_format_dayfirst(format_string: str) -> bool | None:
 
 
 def _try_to_repect_dayfirst(
-    formats: np.ndarray,
+    formats: list,
     dayfirst: bool | None,
     warn: bool,
 ) -> str:
@@ -242,7 +242,7 @@ def _guess_datetime_format_for_array(
     sample_check = arr_non_null[np.arange(0, len(arr_non_null), step_check)]
     sample_find = arr_non_null[np.arange(0, len(arr_non_null), step_find)]
     # try formats
-    formats_found = set()
+    formats_found = []
     for datetime_string in sample_find:
         # catch warnings from guess_datetime_format
         # which appears when dayfirst is contradicted
@@ -253,14 +253,17 @@ def _guess_datetime_format_for_array(
                 message="Parsing dates in .* format when dayfirst=.* was specified.",
             )
             if type(datetime_string) is str:
-                formats_found.add(
+                formats_found.append(
                     guess_datetime_format(datetime_string, dayfirst=False)
                 )
-                formats_found.add(guess_datetime_format(datetime_string, dayfirst=True))
+                formats_found.append(
+                    guess_datetime_format(datetime_string, dayfirst=True)
+                )
+    formats_found = [format_ for format_ in formats_found if format_ is not None]
     # remove YDM as it does not exist
     # but is returned by guess_datetime_format
-    for format_ in list(formats_found):
-        if (format_ is None) or re.match(r".*%Y.*%d.*%m.*", format_):
+    for format_ in np.unique(formats_found):
+        if re.match(r".*%Y.*%d.*%m.*", format_):
             # doesn't exist but is returned by guess_datetime_format
             formats_found.remove(format_)
     # Try to apply the formats found
@@ -284,25 +287,27 @@ def _guess_datetime_format_for_array(
             UserWarning,
             stacklevel=find_stack_level(),
         )
+    print(formats_checked)
     if not len(formats_checked):
         return None
     else:
         # Sort by the number of strings that match the format
         formats_checked.sort(key=lambda x: x[1], reverse=True)
         best_format = _try_to_repect_dayfirst(formats_checked, dayfirst, warn)
+        print(best_format)
         return best_format
 
 
 def _iterative_conversion(
     arg: np.ndarray,
-    name: str,
+    name: Hashable,
     utc: bool,
     unit: str | None,
     errors: DateTimeErrorChoices,
     dayfirst: bool | None,
     yearfirst: bool | None,
     exact: bool,
-) -> tuple[np.ndarray, np.ndarray]:
+) -> Index:
     """
     For mixed format, convert datetimestrings iteratively,
     from the best format (the format which work for most samples)
@@ -313,7 +318,7 @@ def _iterative_conversion(
     arg : ndarray
         Array of datetime strings.
     name : str
-        Name of the argument.
+        None or string for the Index name
     utc : bool
         Whether to convert/localize timestamps to UTC.
     unit : str
@@ -538,7 +543,7 @@ def _convert_and_box_cache(
 
 
 def _return_parsed_timezone_results(
-    result: np.ndarray, timezones, utc: bool, name: str
+    result: np.ndarray, timezones, utc: bool, name: Hashable
 ) -> Index:
     """
     Return results from array_strptime if a %z or %Z directive was passed.
@@ -996,6 +1001,7 @@ def to_datetime(
         - "mixed", to allow for multiple formats. Values will be parsed iteratively
         using the most promising format at each step. This is risky,
         and you should probably use it along with `dayfirst`.
+
     exact : bool, default True
         Control how `format` is used:
 

From 7ca7244ce9036d3f61177be1c661d68d022708c9 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 14:42:50 +0200
Subject: [PATCH 14/37] fix type hints for np.unique

---
 pandas/core/tools/datetimes.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index d33af6482d492..ad9419f4bcd6d 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -253,13 +253,12 @@ def _guess_datetime_format_for_array(
                 message="Parsing dates in .* format when dayfirst=.* was specified.",
             )
             if type(datetime_string) is str:
-                formats_found.append(
-                    guess_datetime_format(datetime_string, dayfirst=False)
-                )
-                formats_found.append(
-                    guess_datetime_format(datetime_string, dayfirst=True)
-                )
-    formats_found = [format_ for format_ in formats_found if format_ is not None]
+                for try_dayfirst in [False, True]:
+                    format_found = guess_datetime_format(
+                        datetime_string, dayfirst=try_dayfirst
+                    )
+                    if format_found is not None:
+                        formats_found.append(format_found)
     # remove YDM as it does not exist
     # but is returned by guess_datetime_format
     for format_ in np.unique(formats_found):

From 4b81192e258a4f97b1953b25ed50b971b2425114 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 15:24:32 +0200
Subject: [PATCH 15/37] remove prints

---
 pandas/core/tools/datetimes.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index ad9419f4bcd6d..cd58abace1f72 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -286,14 +286,12 @@ def _guess_datetime_format_for_array(
             UserWarning,
             stacklevel=find_stack_level(),
         )
-    print(formats_checked)
     if not len(formats_checked):
         return None
     else:
         # Sort by the number of strings that match the format
         formats_checked.sort(key=lambda x: x[1], reverse=True)
         best_format = _try_to_repect_dayfirst(formats_checked, dayfirst, warn)
-        print(best_format)
         return best_format
 
 

From 001a270f22aae9383a2c931d0be6c0180d2b1154 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 17:12:59 +0200
Subject: [PATCH 16/37] fix doc

---
 doc/source/whatsnew/v0.19.0.rst |  1 +
 pandas/core/tools/datetimes.py  | 12 +++++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
index c300fc7f286db..bd8b5baa5b701 100644
--- a/doc/source/whatsnew/v0.19.0.rst
+++ b/doc/source/whatsnew/v0.19.0.rst
@@ -766,6 +766,7 @@ This will now convert integers/floats with the default unit of ``ns``.
 
 .. ipython:: python
    :okwarning:
+
    pd.to_datetime([1, "foo"], errors="coerce")
 
 Bug fixes related to ``.to_datetime()``:
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index cd58abace1f72..92ebed261b529 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -214,16 +214,18 @@ def _guess_datetime_format_for_array(
     ----------
     arr : ndarray
         Array of datetime strings.
+    dayfirst : bool
+        dayfirst parsing behavior from to_datetime.
     n_find_format : int
         Number of strings to use to guess the format.
     n_check_format : int
         Number of strings to check for each format found.
-    warn: bool
-        Whether to warn if we contradict dayfirst
+    warn : bool
+        Whether to warn if we contradict dayfirst.
 
     Returns
     -------
-    formats : ndarray
+    ndarray
         Array of tuples with the format and the percentage of strings that
         match the format, sorted by the percentage of strings that match the
         format.
@@ -1182,8 +1184,8 @@ def to_datetime(
     If multiple datetime formats are possible for a value, pandas will try to infer
     the most plausible format using the other examples.
 
-    >>> pd.to_datetime(["01-02-2012", "30-01-2012"])
-    DatetimeIndex(['2012-02-01', '2012-01-30'], dtype='datetime64[ns]', freq=None)
+    >>> pd.to_datetime(["01-02-2012", "02-30-2012"])
+    DatetimeIndex(['2012-01-02', '2012-02-30'], dtype='datetime64[ns]', freq=None)
 
     .. _to_datetime_tz_examples:
 

From fe99f83857ff1d102f6281670881cf9ec33dab07 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 17:58:08 +0200
Subject: [PATCH 17/37] fix example with February 30th

---
 pandas/core/tools/datetimes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 92ebed261b529..ac2846d826bc7 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -1184,8 +1184,8 @@ def to_datetime(
     If multiple datetime formats are possible for a value, pandas will try to infer
     the most plausible format using the other examples.
 
-    >>> pd.to_datetime(["01-02-2012", "02-30-2012"])
-    DatetimeIndex(['2012-01-02', '2012-02-30'], dtype='datetime64[ns]', freq=None)
+    >>> pd.to_datetime(["01-02-2012", "02-27-2012"])
+    DatetimeIndex(['2012-01-02', '2012-02-27'], dtype='datetime64[ns]', freq=None)
 
     .. _to_datetime_tz_examples:
 

From 8de90e4b1d02d0a2fcd13bcd69f4a8d31620f523 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 20:21:28 +0200
Subject: [PATCH 18/37] fix doc

---
 pandas/core/tools/datetimes.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index ac2846d826bc7..795fee0604cf0 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -997,9 +997,10 @@ def to_datetime(
 
         - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
           time string (not necessarily in exactly the same format);
+
         - "mixed", to allow for multiple formats. Values will be parsed iteratively
-        using the most promising format at each step. This is risky,
-        and you should probably use it along with `dayfirst`.
+          using the most promising format at each step. This is risky,
+          and you should probably use it along with `dayfirst`.
 
     exact : bool, default True
         Control how `format` is used:

From 281d45bc355dda8e5fbfe3c4d46563c745d519cc Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Wed, 12 Apr 2023 01:52:18 +0200
Subject: [PATCH 19/37] All tests pass

---
 pandas/_libs/tslib.pyx                 |  34 ++-
 pandas/core/tools/datetimes.py         | 311 +++++++++++++++++++++----
 pandas/tests/tools/test_to_datetime.py | 309 +++++++++++++++++++++---
 3 files changed, 575 insertions(+), 79 deletions(-)

diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index 106f203a16855..8b790e3bd8adc 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -69,17 +69,22 @@ from pandas._libs.tslibs.nattype cimport (
 )
 from pandas._libs.tslibs.timestamps cimport _Timestamp
 
+import cython
+
 from pandas._libs.tslibs import (
     Resolution,
     get_resolution,
 )
 from pandas._libs.tslibs.timestamps import Timestamp
 
-# Note: this is the only non-tslibs intra-pandas dependency here
+from libc.stdlib cimport srand
+from libc.time cimport time
 
 from pandas._libs.missing cimport checknull_with_nat_and_na
 from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
 
+# Note: this is the only non-tslibs intra-pandas dependency here
+
 
 def _test_parse_iso8601(ts: str):
     """
@@ -398,6 +403,33 @@ def first_non_null(values: ndarray) -> int:
         return -1
 
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def random_non_null(values: ndarray, int n) -> ndarray:
+    """Find n non-null values selected at random, return an array of indices."""
+    cdef:
+        Py_ssize_t total = len(values)
+        Py_ssize_t i, non_null_count
+        list non_null_indices = []
+    srand(time(NULL))
+    for i in range(total):
+        val = values[i]
+        if checknull_with_nat_and_na(val):
+            continue
+        if (
+            isinstance(val, str)
+            and
+            (len(val) == 0 or val in nat_strings or val in ("now", "today"))
+        ):
+            continue
+        non_null_indices.append(i)
+    non_null_count = len(non_null_indices)
+    if non_null_count == 0 or n <= 0:
+        return np.empty(0, dtype=np.int64)
+    # use np.random.choice
+    return np.random.choice(non_null_indices, min(n, non_null_count), replace=False)
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 cpdef array_to_datetime(
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 3e1b6070ffc39..0d7e3f4999787 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -4,6 +4,7 @@
 from datetime import datetime
 from functools import partial
 from itertools import islice
+import re
 from typing import (
     TYPE_CHECKING,
     Callable,
@@ -128,27 +129,207 @@ class FulldatetimeDict(YearMonthDayDict, total=False):
 # ---------------------------------------------------------------------
 
 
-def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
-    # Try to guess the format based on the first non-NaN element, return None if can't
-    if (first_non_null := tslib.first_non_null(arr)) != -1:
-        if type(first_non_nan_element := arr[first_non_null]) is str:
-            # GH#32264 np.str_ object
-            guessed_format = guess_datetime_format(
-                first_non_nan_element, dayfirst=dayfirst
+def _check_format_dayfirst(format_string):
+    dayfirst = False
+    for char in ["%d", "%m", "%Y"]:
+        if char not in format_string:
+            return None
+
+    if format_string.index("%d") < format_string.index("%m") and format_string.index(
+        "%m"
+    ) < format_string.index("%Y"):
+        dayfirst = True
+    elif format_string.index("%m") < format_string.index("%d") and format_string.index(
+        "%d"
+    ) < format_string.index("%Y"):
+        dayfirst = False
+    else:
+        dayfirst = None
+
+    return dayfirst
+
+
+def _guess_datetime_format_for_array(
+    arr, n_find_format, n_check_format
+) -> ArrayLike[tuple[str, str]]:
+    """
+    Guess the format of the datetime strings in an array.
+
+    Parameters
+    ----------
+    arr : ndarray
+        Array of datetime strings.
+    n_find_format : int
+        Number of strings to use to guess the format.
+    n_check_format : int
+        Number of strings to check for each format found.
+
+    Returns
+    -------
+    formats : ndarray
+        Array of tuples with the format and the percentage of strings that
+        match the format, sorted by the percentage of strings that match the
+        format.
+    """
+    # Extract a random sample of datetime strings
+    assert (
+        n_find_format <= n_check_format
+    ), "n_check_format must be greater than n_find_format"
+    sample_idx = tslib.random_non_null(arr, n_check_format)
+    sample_check = arr[sample_idx]
+    sample_find = sample_check[:n_find_format]
+    if len(sample_idx) == 0:
+        return []  # FIXME
+    format_found = set()
+    for datetime_string in sample_find:
+        # catch warnings from guess_datetime_format
+        # which appears when dayfirst is contradicted
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                category=UserWarning,
+                message="Parsing dates in .* format when dayfirst=.* was specified.",
             )
-            if guessed_format is not None:
-                return guessed_format
-            # If there are multiple non-null elements, warn about
-            # how parsing might not be consistent
-            if tslib.first_non_null(arr[first_non_null + 1 :]) != -1:
-                warnings.warn(
-                    "Could not infer format, so each element will be parsed "
-                    "individually, falling back to `dateutil`. To ensure parsing is "
-                    "consistent and as-expected, please specify a format.",
-                    UserWarning,
-                    stacklevel=find_stack_level(),
-                )
-    return None
+            if type(datetime_string) is str:
+                format_found.add(guess_datetime_format(datetime_string, dayfirst=False))
+                format_found.add(guess_datetime_format(datetime_string, dayfirst=True))
+    if None in format_found:
+        format_found.remove(None)
+    # remove YDM as it does not exist
+    # but is returned by guess_datetime_format
+    for format in list(format_found):
+        if re.match(r"%Y[-/_.]+%d[-/_.]+%m", format):
+            # doesn't exist but is returned by guess_datetime_format
+            # FIXME
+            format_found.remove(format)
+    # Try to apply the formats found
+    # to a larger sample
+    format_checked = []
+    for format in format_found:
+        converted = array_strptime(sample_check, fmt=format, errors="coerce")[0]
+        format_checked.append(
+            (format, int(100 * np.sum(~np.isnan(converted)) / len(converted)))
+        )
+    # Sort by the number of strings that match the format
+    format_checked.sort(key=lambda x: x[1], reverse=True)
+    if (
+        len(format_checked) == 0
+        and len(sample_check) > 1
+        and np.any([type(e) is str for e in sample_find])
+        # GH#32264 np.str_ objects
+    ):
+        warnings.warn(
+            "Could not infer format, so each element will be parsed "
+            "individually, falling back to `dateutil`. To ensure parsing is "
+            "consistent and as-expected, please specify a format.",
+            UserWarning,
+            stacklevel=find_stack_level(),
+        )
+    return np.array(format_checked, dtype=object)
+
+
+def _try_to_repect_dayfirst(formats, dayfirst):
+    """
+    If several formats work as well, prefer the format which
+    respect dayfirst.
+
+    Parameters
+    ----------
+    formats : ndarray
+        Array of tuples with the format and the percentage of strings that
+        match the format, sorted by the percentage of strings that match the
+        format.
+    dayfirst : bool
+        Should we prefer dayfirst formats
+
+    Returns
+    -------
+    best_format, best_format_dayfirst : tuple of (str, bool or None)
+        The first of the best formats whose day/month order matches ``dayfirst``
+        (else the first best format), and its ``_check_format_dayfirst`` value.
+    """
+    # Find all formats which work for
+    # the largest number of samples
+    best_formats = [
+        format_found for format_found in formats if format_found[1] == formats[0][1]
+    ]
+    # If several formats work as well, prefer the format which
+    # respect dayfirst
+    if len(best_formats) > 1:
+        for format_found in best_formats:
+            if _check_format_dayfirst(format_found[0]) == dayfirst:
+                return format_found[0], _check_format_dayfirst(format_found[0])
+    return best_formats[0][0], _check_format_dayfirst(best_formats[0][0])
+
+
+def _iterative_conversion(arg, formats, utc, unit, errors, dayfirst, yearfirst, exact):
+    """
+    For mixed format, convert datetimestrings iteratively,
+    from the best format (the format which work for most samples)
+    to the worst.
+
+    Parameters
+    ----------
+    arg : ndarray
+        Array of datetime strings.
+    formats : ndarray
+        Array of tuples with the format and the percentage of strings that
+        match the format, sorted by the percentage of strings that match the
+        format.
+    utc : bool
+        Whether to convert/localize timestamps to UTC.
+    unit : str
+        None or string of the frequency of the passed data
+    errors : str
+        error handling behaviors from to_datetime, 'raise', 'coerce', 'ignore'
+    dayfirst : bool
+        dayfirst parsing behavior from to_datetime
+    yearfirst : bool
+        yearfirst parsing behavior from to_datetime
+    exact : bool, default True
+        exact format matching behavior from to_datetime
+
+    """
+    # iteratively convert the remaining samples
+    # in "coerce" mode with the ith best format
+    # until all values are converted or all formats are exhausted
+    # or 10 formats have been tried
+    best_format = _try_to_repect_dayfirst(formats, dayfirst)[0]
+    # remove the best format from the list
+    formats = formats[formats[:, 0] != best_format]
+    result, tz_parsed = array_strptime(arg, best_format, exact, "coerce", utc)
+    indices_succeeded = notna(result)
+    for _ in range(min(len(formats), 10)):
+        best_format = _try_to_repect_dayfirst(formats, dayfirst)[0]
+        formats = formats[formats[:, 0] != best_format]
+        results_format, timezones_format = array_strptime(
+            arg[~indices_succeeded], best_format, exact, "coerce", utc
+        )
+        indices_succeeded_small = notna(results_format)
+        update_indices = np.arange(len(result))[~indices_succeeded][
+            indices_succeeded_small
+        ]
+        result[update_indices] = results_format[indices_succeeded_small]
+        # Write through the integer positions: chaining two boolean masks
+        # (tz_parsed[mask][mask2] = ...) would only modify a temporary copy.
+        tz_parsed[update_indices] = timezones_format[indices_succeeded_small]
+        indices_succeeded[~indices_succeeded] = indices_succeeded_small
+        if indices_succeeded.all():
+            break
+    if not indices_succeeded.all():
+        # if we exhausted all formats and still have missing values
+        if errors == "raise":
+            raise ValueError(
+                f"""Unable to parse "{arg[~indices_succeeded][0]}" as a date.
+                    You can pass `errors="coerce"` or `errors="ignore"` to
+                    ignore this error."""
+            )
+        elif errors == "coerce":
+            result[~indices_succeeded] = iNaT
+        elif errors == "ignore":
+            # TODO check
+            result = arg
+    return result, tz_parsed
 
 
 def should_cache(
@@ -444,27 +625,64 @@ def _convert_listlike_datetimes(
 
     arg = ensure_object(arg)
 
-    if format is None:
-        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
-
-    # `format` could be inferred, or user didn't ask for mixed-format parsing.
+    # get the list of formats which work for some of the elements
+    # sorted by the percentage of elements that match, highest first
+    # It's a list of tuples of (format, percentage of elements that match)
+    best_format = None
     if format is not None and format != "mixed":
-        return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
-
-    result, tz_parsed = objects_to_datetime64ns(
-        arg,
-        dayfirst=dayfirst,
-        yearfirst=yearfirst,
-        utc=utc,
-        errors=errors,
-        allow_object=True,
-    )
-
-    if tz_parsed is not None:
-        # We can take a shortcut since the datetime64 numpy array
-        # is in UTC
-        dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
-        return DatetimeIndex._simple_new(dta, name=name)
+        best_format = format
+    else:
+        # guess the format
+        formats = _guess_datetime_format_for_array(
+            arg, n_find_format=20, n_check_format=250
+        )
+        if len(formats) == 0:
+            result, tz_parsed = objects_to_datetime64ns(
+                arg,
+                dayfirst=dayfirst,
+                yearfirst=yearfirst,
+                utc=utc,
+                errors=errors,
+                allow_object=True,
+            )
+            if tz_parsed is not None:
+                # We can take a shortcut since the datetime64 numpy array
+                # is in UTC
+                dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
+                return DatetimeIndex._simple_new(dta, name=name)
+        if format != "mixed" and len(formats) > 0:
+            # formats[0][1] is the percentage of elements that matched
+            if errors == "raise" and formats[0][1] != 100:
+                raise ValueError(
+                    "No datetime format was found which "
+                    "matched all values in the array.\n"
+                    "You might want to try:\n"
+                    "    - passing `format` if your strings have a consistent format;\n"
+                    "    - passing `format='ISO8601'` if your strings are "
+                    "all ISO8601 but not necessarily in exactly the same format;\n"
+                    "    - passing `format='mixed'`, and the format will be "
+                    "inferred for each element individually. "
+                    "You might want to use `dayfirst` alongside this.\n"
+                    f"Best format found: {formats[0][0]} "
+                    f"(matched {formats[0][1]}% of the values)"
+                )
+            best_format, best_format_dayfirst = _try_to_repect_dayfirst(
+                formats, dayfirst
+            )
+            if best_format_dayfirst is not None and best_format_dayfirst != dayfirst:
+                warnings.warn(
+                    f"Parsing dates in {best_format} format when "
+                    f"dayfirst={dayfirst} was specified. "
+                    f"Pass `dayfirst={not dayfirst}` or specify a format "
+                    "to silence this warning.",
+                    stacklevel=find_stack_level(),
+                )
+    if best_format is not None:
+        return _array_strptime_with_fallback(arg, name, utc, best_format, exact, errors)
+    if format == "mixed" and len(formats) > 0:
+        result, tz_parsed = _iterative_conversion(
+            arg, formats, utc, unit, errors, dayfirst, yearfirst, exact
+        )
 
     return _box_as_indexlike(result, utc=utc, name=name)
 
@@ -764,8 +982,9 @@ def to_datetime(
 
         - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
           time string (not necessarily in exactly the same format);
-        - "mixed", to infer the format for each element individually. This is risky,
-          and you should probably use it along with `dayfirst`.
+        - "mixed", to allow for multiple formats. Values will be parsed iteratively
+          using the most promising format at each step. This is risky,
+          and you should probably use it along with `dayfirst`.
     exact : bool, default True
         Control how `format` is used:
 
@@ -944,6 +1163,14 @@ def to_datetime(
     >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
     NaT
 
+    **Ambiguous format**
+
+    If multiple datetime formats are possible for a value, pandas will try to infer
+    the most plausible format using the other examples.
+
+    >>> pd.to_datetime(["01-02-2012", "30-01-2012"])
+    DatetimeIndex(['2012-02-01', '2012-01-30'], dtype='datetime64[ns]', freq=None)
+
     .. _to_datetime_tz_examples:
 
     **Timezones and time offsets**
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index 0b5696116e610..bf7e1f8ffc03d 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -1302,10 +1302,7 @@ def test_datetime_bool_arrays_mixed(self, cache):
             to_datetime([False, datetime.today()], cache=cache)
         with pytest.raises(
             ValueError,
-            match=(
-                r'^time data "True" doesn\'t match format "%Y%m%d", '
-                f"at position 1. {PARSING_ERR_MSG}$"
-            ),
+            match=(f"{PARSING_ERR_MSG}"),
         ):
             to_datetime(["20130101", True], cache=cache)
         tm.assert_index_equal(
@@ -1652,11 +1649,11 @@ def test_mixed_offsets_with_native_datetime_raises(self):
         mixed = to_datetime(ser)
         expected = Series(
             [
-                "NaT",
+                NaT,
                 Timestamp("1990-01-01"),
                 Timestamp("2015-03-14T16:15:14.123-08:00").to_pydatetime(),
                 Timestamp("2019-03-04T21:56:32.620-07:00").to_pydatetime(),
-                None,
+                NaT,  # TODO check
             ],
             dtype=object,
         )
@@ -1855,7 +1852,7 @@ def test_unit_with_numeric(self, cache, errors, dtype):
             [
                 ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"],
                 [1.434692e18, 1.432766e18, "foo", "NaT"],
-                None,
+                UserWarning,
             ],
         ],
     )
@@ -2393,10 +2390,7 @@ def test_to_datetime_on_datetime64_series(self, cache):
     def test_to_datetime_with_space_in_series(self, cache):
         # GH 6428
         ser = Series(["10/18/2006", "10/18/2008", " "])
-        msg = (
-            r'^time data " " doesn\'t match format "%m/%d/%Y", '
-            rf"at position 2. {PARSING_ERR_MSG}$"
-        )
+        msg = rf"{PARSING_ERR_MSG}"
         with pytest.raises(ValueError, match=msg):
             to_datetime(ser, errors="raise", cache=cache)
         result_coerce = to_datetime(ser, errors="coerce", cache=cache)
@@ -2467,6 +2461,7 @@ def test_to_datetime_strings_vs_constructor(self, result):
         expected = Timestamp(2012, 1, 1)
         assert result == expected
 
+    @pytest.mark.filterwarnings("ignore:Could not infer format")
     def test_to_datetime_unprocessable_input(self, cache):
         # GH 4928
         # GH 21864
@@ -2659,10 +2654,7 @@ def test_dayfirst_warnings_invalid_input(self):
 
         with pytest.raises(
             ValueError,
-            match=(
-                r'^time data "03/30/2011" doesn\'t match format '
-                rf'"%d/%m/%Y", at position 1. {PARSING_ERR_MSG}$'
-            ),
+            match=(rf"{PARSING_ERR_MSG}"),
         ):
             to_datetime(arr, dayfirst=True)
 
@@ -2681,32 +2673,77 @@ def test_to_datetime_dta_tz(self, klass):
 
 class TestGuessDatetimeFormat:
     @pytest.mark.parametrize(
-        "test_list",
+        "test_list, expected_formats",
         [
-            [
-                "2011-12-30 00:00:00.000000",
-                "2011-12-30 00:00:00.000000",
-                "2011-12-30 00:00:00.000000",
-            ],
-            [np.nan, np.nan, "2011-12-30 00:00:00.000000"],
-            ["", "2011-12-30 00:00:00.000000"],
-            ["NaT", "2011-12-30 00:00:00.000000"],
-            ["2011-12-30 00:00:00.000000", "random_string"],
-            ["now", "2011-12-30 00:00:00.000000"],
-            ["today", "2011-12-30 00:00:00.000000"],
+            (
+                [
+                    "2011-12-30 00:00:00.000000",
+                    "2011-12-30 00:00:00.000000",
+                    "2011-12-30 00:00:00.000000",
+                ],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+            ),
+            (
+                [np.nan, np.nan, "2011-12-30 00:00:00.000000"],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+            ),
+            (
+                ["", "2011-12-30 00:00:00.000000"],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+            ),
+            (
+                ["NaT", "2011-12-30 00:00:00.000000"],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+            ),
+            (
+                ["2011-12-30 00:00:00.000000", "random_string"],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 50)], dtype=object),
+            ),
+            (
+                ["now", "2011-12-30 00:00:00.000000"],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+            ),
+            (
+                ["today", "2011-12-30 00:00:00.000000"],
+                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+            ),
+            (
+                ["30-12-2012 00:00:00.000000", "12-30-2012 00:00:00.000000"],
+                np.array(
+                    [("%d-%m-%Y %H:%M:%S.%f", 50), ("%m-%d-%Y %H:%M:%S.%f", 50)],
+                    dtype=object,
+                ),
+            ),
+            (
+                ["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"],
+                np.array(
+                    [("%d-%m-%Y", 33), ("%m/%d/%Y", 33), ("%Y-%m-%d %H:%M:%S.%f", 33)],
+                    dtype=object,
+                ),
+            ),
         ],
     )
-    def test_guess_datetime_format_for_array(self, test_list):
-        expected_format = "%Y-%m-%d %H:%M:%S.%f"
+    def test_guess_datetime_format_for_array(self, test_list, expected_formats):
         test_array = np.array(test_list, dtype=object)
-        assert tools._guess_datetime_format_for_array(test_array) == expected_format
+        res = tools._guess_datetime_format_for_array(
+            test_array, n_find_format=5, n_check_format=5
+        )
+        # sort according to first element of tuple (format string) to ignore order
+        sorted_index = np.argsort([x[0] for x in res])
+        res = res[sorted_index]
+        sorted_index = np.argsort([x[0] for x in expected_formats])
+        expected_formats = expected_formats[sorted_index]
+        assert (res == expected_formats).all()
+        # TODO more tests
 
     @td.skip_if_not_us_locale
     def test_guess_datetime_format_for_array_all_nans(self):
         format_for_string_of_nans = tools._guess_datetime_format_for_array(
-            np.array([np.nan, np.nan, np.nan], dtype="O")
+            np.array([np.nan, np.nan, np.nan], dtype="O"),
+            n_find_format=5,
+            n_check_format=5,
         )
-        assert format_for_string_of_nans is None
+        assert len(format_for_string_of_nans) == 0
 
 
 class TestToDatetimeInferFormat:
@@ -2730,10 +2767,7 @@ def test_to_datetime_infer_datetime_format_consistent_format(
     def test_to_datetime_inconsistent_format(self, cache):
         data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"]
         ser = Series(np.array(data))
-        msg = (
-            r'^time data "01-02-2011 00:00:00" doesn\'t match format '
-            rf'"%m/%d/%Y %H:%M:%S", at position 1. {PARSING_ERR_MSG}$'
-        )
+        msg = f"{PARSING_ERR_MSG}"
         with pytest.raises(ValueError, match=msg):
             to_datetime(ser, cache=cache)
 
@@ -3584,3 +3618,206 @@ def test_from_numeric_arrow_dtype(any_numeric_ea_dtype):
     result = to_datetime(ser)
     expected = Series([1, 2], dtype="datetime64[ns]")
     tm.assert_series_equal(result, expected)
+
+
+class TestParsingMultipleDates:
+    # TODO handle yearfirst
+    @pytest.mark.parametrize(
+        "date_str, expected_format, dayfirst",
+        [
+            (["2010-01-01", "2010-02-02", "2010-01-03"], "%Y-%m-%d", None),
+            (["2010-01-01", "2010-02-13", "2010-01-03"], "%Y-%m-%d", None),
+            (["01-01-2012", "01-13-2012", "01-03-2010"], "%m-%d-%Y", False),
+            (["01-01-2012", "13-01-2012", "01-03-2010"], "%d-%m-%Y", True),
+        ],
+    )
+    def test_multiple_dates_non_ambiguous(self, date_str, expected_format, dayfirst):
+        # only one format is possible
+        expected = to_datetime(date_str, format=expected_format)
+
+        # all errors should prefer the format
+        # which works for all dates
+        for errors in ["raise", "coerce", "ignore"]:
+            for try_dayfirst in [True, False]:
+                # warn if we contradict dayfirst
+                # we don't warn when format is "%Y-%m-%d"
+                # TODO same for yearfirst
+                if dayfirst is not None and try_dayfirst != dayfirst:
+                    with tm.assert_produces_warning(UserWarning):
+                        result = to_datetime(
+                            date_str, errors=errors, dayfirst=try_dayfirst
+                        )
+                else:
+                    result = to_datetime(date_str, errors=errors, dayfirst=try_dayfirst)
+                tm.assert_index_equal(result, expected)
+
+        # should also work with format="mixed"
+        result = to_datetime(date_str, format="mixed")
+        tm.assert_index_equal(result, expected)
+
+    # ambiguous dates
+    @pytest.mark.parametrize(
+        "date_str",
+        [
+            (["01-01-2012", "01-05-2012", "01-03-2010"]),
+            (["01-01-2012", "05-01-2012", "01-03-2010"]),
+        ],
+    )
+    def test_multiple_dates_ambiguous(self, date_str):
+        # multiple formats work for all dates
+        # we should respect the dayfirst argument
+        expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y")
+        expected_not_dayfirst = to_datetime(date_str, format="%m-%d-%Y")
+
+        for errors in ["raise", "coerce", "ignore"]:
+            result_dayfirst = to_datetime(date_str, errors=errors, dayfirst=True)
+            tm.assert_index_equal(result_dayfirst, expected_dayfirst)
+
+            result_not_dayfirst = to_datetime(date_str, errors=errors, dayfirst=False)
+            tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst)
+
+        # should also work with format="mixed"
+        result_dayfirst = to_datetime(date_str, format="mixed", dayfirst=True)
+        tm.assert_index_equal(result_dayfirst, expected_dayfirst)
+
+        result_not_dayfirst = to_datetime(date_str, format="mixed", dayfirst=False)
+        tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst)
+
+    # ambiguous dates with errors
+    @pytest.mark.parametrize(
+        "date_str",
+        [
+            (["01-01-2012", "01-05-2012", "random_string", "01-03-2010"]),
+            (["01-01-2012", "05-01-2012", "random_string", "01-03-2010"]),
+        ],
+    )
+    def test_multiple_dates_ambiguous_error(self, date_str):
+        # multiple formats work for all dates
+        # we should respect the dayfirst argument
+        for errors in ["coerce", "ignore"]:
+            expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y", errors=errors)
+            expected_not_dayfirst = to_datetime(
+                date_str, format="%m-%d-%Y", errors=errors
+            )
+            result_dayfirst = to_datetime(date_str, errors=errors, dayfirst=True)
+            tm.assert_index_equal(result_dayfirst, expected_dayfirst)
+
+            result_not_dayfirst = to_datetime(date_str, errors=errors, dayfirst=False)
+            tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst)
+
+        # should raise an error with "raise"
+        with pytest.raises(
+            ValueError,
+            match="No datetime format was found which matched all values in the array",
+        ):
+            to_datetime(date_str, errors="raise", dayfirst=True)
+        with pytest.raises(
+            ValueError,
+            match="No datetime format was found which matched all values in the array",
+        ):
+            to_datetime(date_str, errors="raise", dayfirst=False)
+
+        # same with mixed
+        for errors in ["coerce", "ignore"]:
+            expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y", errors=errors)
+            expected_not_dayfirst = to_datetime(
+                date_str, format="%m-%d-%Y", errors=errors
+            )
+            result_dayfirst = to_datetime(
+                date_str, errors=errors, dayfirst=True, format="mixed"
+            )
+            tm.assert_index_equal(result_dayfirst, expected_dayfirst)
+
+            result_not_dayfirst = to_datetime(
+                date_str, errors=errors, dayfirst=False, format="mixed"
+            )
+            tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst)
+
+        # should raise an error with "raise"
+        with pytest.raises(
+            ValueError, match="""Unable to parse "random_string" as a date"""
+        ):
+            to_datetime(date_str, errors="raise", dayfirst=True, format="mixed")
+        with pytest.raises(
+            ValueError, match="""Unable to parse "random_string" as a date"""
+        ):
+            to_datetime(date_str, errors="raise", dayfirst=False, format="mixed")
+
+    # mixed formats
+    @pytest.mark.parametrize(
+        "date_str, expected_formats, expected_mixed",
+        [
+            (
+                [
+                    "01-02-2012",
+                    "13-05-2012",
+                    "14-03-2010",
+                    "03-13-2012",
+                    "15-05-2012",
+                    "03-13-2010",
+                ],
+                ["%d-%m-%Y", "%m-%d-%Y"],
+                DatetimeIndex(
+                    [
+                        "2012-02-01",
+                        "2012-05-13",
+                        "2010-03-14",
+                        "2012-03-13",
+                        "2012-05-15",
+                        "2010-03-13",
+                    ],
+                    dtype="datetime64[ns]",
+                ),
+            ),
+            (
+                [
+                    "01-02-2012",
+                    "05-13-2012",
+                    "03-14-2010",
+                    "13-03-2012",
+                    "05-15-2012",
+                    "13-03-2010",
+                ],
+                ["%m-%d-%Y", "%d-%m-%Y"],
+                DatetimeIndex(
+                    [
+                        "2012-01-02",
+                        "2012-05-13",
+                        "2010-03-14",
+                        "2012-03-13",
+                        "2012-05-15",
+                        "2010-03-13",
+                    ],
+                    dtype="datetime64[ns]",
+                ),
+            ),
+        ],
+    )
+    def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed):
+        # no format works for all dates
+        # raise should raise an error
+        with pytest.raises(
+            ValueError,
+            match="No datetime format was found which matched all values in the array",
+        ):
+            to_datetime(date_str, errors="raise")
+
+        # coerce and ignore should choose the format
+        # which works for the most dates (the first one)
+        for errors in ["coerce", "ignore"]:
+            expected = to_datetime(date_str, format=expected_formats[0], errors=errors)
+            if expected_formats[0] == "%d-%m-%Y":
+                # contradicting default dayfirst=False
+                with tm.assert_produces_warning(UserWarning):
+                    result = to_datetime(date_str, errors=errors)
+            else:
+                result = to_datetime(date_str, errors=errors)
+            tm.assert_index_equal(result, expected)
+
+        # if format="mixed", the conversion should be done from the best format
+        # to the worst format
+        result = to_datetime(date_str, format="mixed")
+        tm.assert_index_equal(result, expected_mixed)
+
+    # TODO multiple precision
+    # (["2011-12-30 00:00:00.000000", "2011-11-20"], "%Y-%m-%d", None)

From 51d9d98da0a80c72eb2190609611543541d81c8e Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Wed, 12 Apr 2023 14:26:35 +0200
Subject: [PATCH 20/37] Update changelog

---
 doc/source/whatsnew/v2.1.0.rst | 48 +++++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index b0e9fa2cea0ee..f60ce987552da 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -16,7 +16,53 @@ Enhancements
 
 .. _whatsnew_210.enhancements.enhancement1:
 
-enhancement1
+``pd.to_datetime`` now tries to infer the datetime format of each string by considering
+the whole Series, and tries to find the format which works for most strings. If several
+formats work equally well, the one matching the ``dayfirst`` parameter is returned. If
+``format="mixed"``, pandas does the same thing, then tries the second best format on the
+strings which failed to parse with the first best format, and so on (:issue:`52508`).
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [1]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"])
+    Out[1]:
+    ValueError: time data "30-01-2012" doesn't match format "%m-%d-%Y", at position 2. You might want to try:
+    - passing `format` if your strings have a consistent format;
+    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
+    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.
+
+    In [2]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], errors="coerce")
+    Out[2]:
+    DatetimeIndex(['2012-01-02', '2012-01-03', 'NaT'], dtype='datetime64[ns]', freq=None)
+
+    In [3]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], format="mixed")
+    Out[3]:
+    DatetimeIndex(['2012-01-02', '2012-01-03', '2012-01-30'], dtype='datetime64[ns]', freq=None)
+
+
+*New behavior*:
+
+.. code-block:: ipython
+
+    In [1]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"])
+    Out[1]:
+    UserWarning: Parsing dates in %d-%m-%Y format when dayfirst=False was specified.
+    Pass `dayfirst=True` or specify a format to silence this warning.
+    DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]',
+    freq=None)
+
+    In [2]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], errors="coerce")
+    Out[2]:
+    UserWarning: Parsing dates in %d-%m-%Y format when dayfirst=False was specified. Pass `dayfirst=True` or specify a format to silence this warning.
+    DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None)
+
+    In [3]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], format="mixed")
+    Out[3]:
+    DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None)
+
+
 ^^^^^^^^^^^^
 
 .. _whatsnew_210.enhancements.enhancement2:

From 1d7df6eeb2966f68d8cd7b33f211e5bde7be9b27 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Wed, 12 Apr 2023 14:49:47 +0200
Subject: [PATCH 21/37] Add missing type hints

---
 pandas/_libs/tslib.pyi         |  1 +
 pandas/core/tools/datetimes.py | 43 ++++++++++++++++++++--------------
 2 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi
index 9819b5173db56..bd8748cd2650a 100644
--- a/pandas/_libs/tslib.pyi
+++ b/pandas/_libs/tslib.pyi
@@ -17,6 +17,7 @@ def array_with_unit_to_datetime(
     errors: str = ...,
 ) -> tuple[np.ndarray, tzinfo | None]: ...
 def first_non_null(values: np.ndarray) -> int: ...
+def random_non_null(values: np.ndarray, n: int) -> np.ndarray: ...
 def array_to_datetime(
     values: npt.NDArray[np.object_],
     errors: str = ...,
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 0d7e3f4999787..d0b576c4c0a8f 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -129,8 +129,7 @@ class FulldatetimeDict(YearMonthDayDict, total=False):
 # ---------------------------------------------------------------------
 
 
-def _check_format_dayfirst(format_string):
-    dayfirst = False
+def _check_format_dayfirst(format_string: str) -> bool | None:
     for char in ["%d", "%m", "%Y"]:
         if char not in format_string:
             return None
@@ -150,8 +149,10 @@ def _check_format_dayfirst(format_string):
 
 
 def _guess_datetime_format_for_array(
-    arr, n_find_format, n_check_format
-) -> ArrayLike[tuple[str, str]]:
+    arr: np.ndarray,
+    n_find_format: int,
+    n_check_format: int,
+) -> np.ndarray:
     """
     Guess the format of the datetime strings in an array.
 
@@ -179,7 +180,7 @@ def _guess_datetime_format_for_array(
     sample_check = arr[sample_idx]
     sample_find = sample_check[:n_find_format]
     if len(sample_idx) == 0:
-        return []  # FIXME
+        return np.array([], dtype=object)
     format_found = set()
     for datetime_string in sample_find:
         # catch warnings from guess_datetime_format
@@ -193,22 +194,19 @@ def _guess_datetime_format_for_array(
             if type(datetime_string) is str:
                 format_found.add(guess_datetime_format(datetime_string, dayfirst=False))
                 format_found.add(guess_datetime_format(datetime_string, dayfirst=True))
-    if None in format_found:
-        format_found.remove(None)
     # remove YDM as it does not exist
     # but is returned by guess_datetime_format
-    for format in list(format_found):
-        if re.match(r"%Y[-/_.]+%d[-/_.]+%m", format):
+    for format_ in list(format_found):
+        if format_ is None or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_):
             # doesn't exist but is returned by guess_datetime_format
-            # FIXME
-            format_found.remove(format)
+            format_found.remove(format_)
     # Try to apply the formats found
     # to a larger sample
     format_checked = []
-    for format in format_found:
-        converted = array_strptime(sample_check, fmt=format, errors="coerce")[0]
+    for format_ in format_found:
+        converted = array_strptime(sample_check, fmt=format_, errors="coerce")[0]
         format_checked.append(
-            (format, int(100 * np.sum(~np.isnan(converted)) / len(converted)))
+            (format_, int(100 * np.sum(~np.isnan(converted)) / len(converted)))
         )
     # Sort by the number of strings that match the format
     format_checked.sort(key=lambda x: x[1], reverse=True)
@@ -228,7 +226,10 @@ def _guess_datetime_format_for_array(
     return np.array(format_checked, dtype=object)
 
 
-def _try_to_repect_dayfirst(formats, dayfirst):
+def _try_to_repect_dayfirst(
+    formats: np.ndarray,
+    dayfirst: bool | None,
+) -> tuple[str, bool | None]:
     """
     If several formats work as well, prefer the format which
     respect dayfirst.
@@ -262,7 +263,16 @@ def _try_to_repect_dayfirst(formats, dayfirst):
     return best_formats[0][0], _check_format_dayfirst(best_formats[0][0])
 
 
-def _iterative_conversion(arg, formats, utc, unit, errors, dayfirst, yearfirst, exact):
+def _iterative_conversion(
+    arg: np.ndarray,
+    formats: np.ndarray,
+    utc: bool,
+    unit: str | None,
+    errors: DateTimeErrorChoices,
+    dayfirst: bool | None,
+    yearfirst: bool | None,
+    exact: bool,
+) -> tuple[np.ndarray, np.ndarray]:
     """
     For mixed format, convert datetimestrings iteratively,
     from the best format (the format which work for most samples)
@@ -327,7 +337,6 @@ def _iterative_conversion(arg, formats, utc, unit, errors, dayfirst, yearfirst,
         elif errors == "coerce":
             result[~indices_succeeded] = iNaT
         elif errors == "ignore":
-            # TODO check
             result = arg
     return result, tz_parsed
 

From e6cf3ad1efcf7da811fe0568d8671479625856e8 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Wed, 12 Apr 2023 15:45:46 +0200
Subject: [PATCH 22/37] Cleaning

---
 pandas/_libs/tslib.pyx                 | 4 ----
 pandas/tests/tools/test_to_datetime.py | 3 ---
 2 files changed, 7 deletions(-)

diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index 8b790e3bd8adc..9b2ca61e12c29 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -69,8 +69,6 @@ from pandas._libs.tslibs.nattype cimport (
 )
 from pandas._libs.tslibs.timestamps cimport _Timestamp
 
-import cython
-
 from pandas._libs.tslibs import (
     Resolution,
     get_resolution,
@@ -83,8 +81,6 @@ from libc.time cimport time
 from pandas._libs.missing cimport checknull_with_nat_and_na
 from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
 
-# Note: this is the only non-tslibs intra-pandas dependency here
-
 
 def _test_parse_iso8601(ts: str):
     """
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index bf7e1f8ffc03d..b8f58dda81a5e 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -3818,6 +3818,3 @@ def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed):
         # to the worst format
         result = to_datetime(date_str, format="mixed")
         tm.assert_index_equal(result, expected_mixed)
-
-    # TODO multiple precision
-    # (["2011-12-30 00:00:00.000000", "2011-11-20"], "%Y-%m-%d", None)

From 6998bf8ad5041073f0c19cc206014b052371291d Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Wed, 12 Apr 2023 15:48:27 +0200
Subject: [PATCH 23/37] Typo

---
 doc/source/whatsnew/v2.1.0.rst | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index f60ce987552da..d5ad1e111dd64 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -17,7 +17,8 @@ Enhancements
 .. _whatsnew_210.enhancements.enhancement1:
 
 ``pd.to_datetime`` now tries to infer the datetime format of each string by considering
-the whole Series, and tries to find the format which work for most strings. If several
+a random sample (instead of the first non-null sample),
+and tries to find the format which works for most strings. If several
 formats work as well, the one which matches the ``dayfirst`` parameter is returned. If
 ``format="mixed"``, pandas does the same thing, then tries the second best format on the
 strings which failed to parse with the first best format, and so on (:issue:`52508`).
@@ -63,8 +64,6 @@ strings which failed to parse with the first best format, and so on (:issue:`525
     DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None)
 
 
-^^^^^^^^^^^^
-
 .. _whatsnew_210.enhancements.enhancement2:
 
 ``map(func, na_action="ignore")`` now works for all array types

From f98ea1f86fa3f9e933ab3c54771bd9930c054a3b Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Wed, 12 Apr 2023 15:52:00 +0200
Subject: [PATCH 24/37] comment change

---
 pandas/core/tools/datetimes.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index d0b576c4c0a8f..66ae3f8cbb6a5 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -634,14 +634,13 @@ def _convert_listlike_datetimes(
 
     arg = ensure_object(arg)
 
-    # get the list of formats which work for some of the elements
-    # sorted by the percentage of elements that match, highest first
-    # It's a list of tuples of (format, percentage of elements that match)
     best_format = None
     if format is not None and format != "mixed":
         best_format = format
     else:
-        # guess the format
+        # get a list of formats which work for some of the elements
+        # sorted by the percentage of elements that match, highest first
+        # It's a list of tuples of (format, percentage of elements that match)
         formats = _guess_datetime_format_for_array(
             arg, n_find_format=20, n_check_format=250
         )

From 86aa61c264df46f1e6e8b21e46de58bdea896786 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Thu, 13 Apr 2023 13:07:32 +0200
Subject: [PATCH 25/37] simplification

---
 pandas/core/tools/datetimes.py         | 238 ++++++++++++-------------
 pandas/tests/tools/test_to_datetime.py | 111 ++++++------
 2 files changed, 170 insertions(+), 179 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 66ae3f8cbb6a5..8f16d16d37a07 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -148,11 +148,63 @@ def _check_format_dayfirst(format_string: str) -> bool | None:
     return dayfirst
 
 
+def _try_to_repect_dayfirst(
+    formats: np.ndarray,
+    dayfirst: bool | None,
+    warn: bool,
+) -> str:
+    """
+    If several formats work equally well, prefer the format which
+    respects dayfirst.
+
+    Parameters
+    ----------
+    formats : ndarray
+        Array of tuples with the format and the percentage of strings that
+        match the format, sorted by the percentage of strings that match the
+        format.
+    dayfirst : bool
+        Should we prefer dayfirst formats
+
+    Returns
+    -------
+    best_format :  str
+        The format among the best formats which respect dayfirst,
+        if any, otherwise the first best format.
+    """
+    # Find all formats which work for
+    # the largest number of samples
+    best_formats = [
+        formats_found for formats_found in formats if formats_found[1] == formats[0][1]
+    ]
+    # If several formats work as well, prefer the format which
+    # respect dayfirst
+    if len(best_formats) > 1:
+        for formats_found in best_formats:
+            if _check_format_dayfirst(formats_found[0]) == dayfirst:
+                return formats_found[0]
+    if (
+        warn
+        and _check_format_dayfirst(best_formats[0][0]) is not None
+        and _check_format_dayfirst(best_formats[0][0]) != dayfirst
+    ):
+        warnings.warn(
+            f"Parsing dates in {best_formats[0][0]} format when "
+            f"dayfirst={dayfirst} was specified. "
+            f"Pass `dayfirst={not dayfirst}` or specify a format "
+            "to silence this warning.",
+            stacklevel=find_stack_level(),
+        )
+    return best_formats[0][0]
+
+
 def _guess_datetime_format_for_array(
     arr: np.ndarray,
-    n_find_format: int,
-    n_check_format: int,
-) -> np.ndarray:
+    dayfirst: bool | None,
+    n_find_format: int = 10,
+    n_check_format: int = 200,
+    warn: bool = True,
+) -> str | None:
     """
     Guess the format of the datetime strings in an array.
 
@@ -164,6 +216,8 @@ def _guess_datetime_format_for_array(
         Number of strings to use to guess the format.
     n_check_format : int
         Number of strings to check for each format found.
+    warn: bool
+        Whether to warn if we contradict dayfirst
 
     Returns
     -------
@@ -180,8 +234,8 @@ def _guess_datetime_format_for_array(
     sample_check = arr[sample_idx]
     sample_find = sample_check[:n_find_format]
     if len(sample_idx) == 0:
-        return np.array([], dtype=object)
-    format_found = set()
+        return None
+    formats_found = set()
     for datetime_string in sample_find:
         # catch warnings from guess_datetime_format
         # which appears when dayfirst is contradicted
@@ -192,26 +246,26 @@ def _guess_datetime_format_for_array(
                 message="Parsing dates in .* format when dayfirst=.* was specified.",
             )
             if type(datetime_string) is str:
-                format_found.add(guess_datetime_format(datetime_string, dayfirst=False))
-                format_found.add(guess_datetime_format(datetime_string, dayfirst=True))
+                formats_found.add(
+                    guess_datetime_format(datetime_string, dayfirst=False)
+                )
+                formats_found.add(guess_datetime_format(datetime_string, dayfirst=True))
     # remove YDM as it does not exist
     # but is returned by guess_datetime_format
-    for format_ in list(format_found):
-        if format_ is None or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_):
+    for format_ in list(formats_found):
+        if (format_ is None) or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_):
             # doesn't exist but is returned by guess_datetime_format
-            format_found.remove(format_)
+            formats_found.remove(format_)
     # Try to apply the formats found
     # to a larger sample
-    format_checked = []
-    for format_ in format_found:
+    formats_checked = []
+    for format_ in formats_found:
         converted = array_strptime(sample_check, fmt=format_, errors="coerce")[0]
-        format_checked.append(
+        formats_checked.append(
             (format_, int(100 * np.sum(~np.isnan(converted)) / len(converted)))
         )
-    # Sort by the number of strings that match the format
-    format_checked.sort(key=lambda x: x[1], reverse=True)
     if (
-        len(format_checked) == 0
+        len(formats_checked) == 0
         and len(sample_check) > 1
         and np.any([type(e) is str for e in sample_find])
         # GH#32264 np.str_ objects
@@ -223,49 +277,18 @@ def _guess_datetime_format_for_array(
             UserWarning,
             stacklevel=find_stack_level(),
         )
-    return np.array(format_checked, dtype=object)
-
-
-def _try_to_repect_dayfirst(
-    formats: np.ndarray,
-    dayfirst: bool | None,
-) -> tuple[str, bool | None]:
-    """
-    If several formats work as well, prefer the format which
-    respect dayfirst.
-
-    Parameters
-    ----------
-    formats : ndarray
-        Array of tuples with the format and the percentage of strings that
-        match the format, sorted by the percentage of strings that match the
-        format.
-    dayfirst : bool
-        Should we prefer dayfirst formats
-
-    Returns
-    -------
-    best_format :  str
-        The format among the best formats which respect dayfirst,
-        if any, otherwise the first best format.
-    """
-    # Find all formats which work for
-    # the largest number of samples
-    best_formats = [
-        format_found for format_found in formats if format_found[1] == formats[0][1]
-    ]
-    # If several formats work as well, prefer the format which
-    # respect dayfirst
-    if len(best_formats) > 1:
-        for format_found in best_formats:
-            if _check_format_dayfirst(format_found[0]) == dayfirst:
-                return format_found[0], _check_format_dayfirst(format_found[0])
-    return best_formats[0][0], _check_format_dayfirst(best_formats[0][0])
+    if not len(formats_checked):
+        return None
+    else:
+        # Sort by the number of strings that match the format
+        formats_checked.sort(key=lambda x: x[1], reverse=True)
+        best_format = _try_to_repect_dayfirst(formats_checked, dayfirst, warn)
+        return best_format
 
 
 def _iterative_conversion(
     arg: np.ndarray,
-    formats: np.ndarray,
+    name: str,
     utc: bool,
     unit: str | None,
     errors: DateTimeErrorChoices,
@@ -282,10 +305,8 @@ def _iterative_conversion(
     ----------
     arg : ndarray
         Array of datetime strings.
-    formats : ndarray
-        Array of tuples with the format and the percentage of strings that
-        match the format, sorted by the percentage of strings that match the
-        format.
+    name : str
+        Name of the argument.
     utc : bool
         Whether to convert/localize timestamps to UTC.
     unit : str
@@ -302,16 +323,18 @@ def _iterative_conversion(
     """
     # iteratively convert the remaining samples
     # in "coerce" mode with the ith best format
-    # until all values are converted or all formats are exhausted
     # or 10 formats have been tried
-    best_format = _try_to_repect_dayfirst(formats, dayfirst)[0]
-    # remove the best format from the list
-    formats = formats[formats[:, 0] != best_format]
+    # if we contradict dayfirst, we warn for the first format, but not the rest
+    best_format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst, warn=True)
     result, tz_parsed = array_strptime(arg, best_format, exact, "coerce", utc)
     indices_succeeded = notna(result)
-    for _ in range(min(len(formats), 10)):
-        best_format = _try_to_repect_dayfirst(formats, dayfirst)[0]
-        formats = formats[formats[:, 0] != best_format]
+    for _ in range(10):
+        best_format = _guess_datetime_format_for_array(
+            arg[~indices_succeeded], dayfirst=dayfirst, warn=False
+        )
+
+        if best_format is None:
+            break
         results_format, timezones_format = array_strptime(
             arg[~indices_succeeded], best_format, exact, "coerce", utc
         )
@@ -338,7 +361,11 @@ def _iterative_conversion(
             result[~indices_succeeded] = iNaT
         elif errors == "ignore":
             result = arg
-    return result, tz_parsed
+
+    if any(tz is not None for tz in tz_parsed):
+        return _return_parsed_timezone_results(result, tz_parsed, utc, name)
+
+    return _box_as_indexlike(result, utc=utc, name=name)
 
 
 def should_cache(
@@ -634,64 +661,33 @@ def _convert_listlike_datetimes(
 
     arg = ensure_object(arg)
 
-    best_format = None
+    if format is None:
+        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
+
+    # `format` could be inferred, or user didn't ask for mixed-format parsing.
     if format is not None and format != "mixed":
-        best_format = format
-    else:
-        # get a list of formats which work for some of the elements
-        # sorted by the percentage of elements that match, highest first
-        # It's a list of tuples of (format, percentage of elements that match)
-        formats = _guess_datetime_format_for_array(
-            arg, n_find_format=20, n_check_format=250
-        )
-        if len(formats) == 0:
-            result, tz_parsed = objects_to_datetime64ns(
-                arg,
-                dayfirst=dayfirst,
-                yearfirst=yearfirst,
-                utc=utc,
-                errors=errors,
-                allow_object=True,
-            )
-            if tz_parsed is not None:
-                # We can take a shortcut since the datetime64 numpy array
-                # is in UTC
-                dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
-                return DatetimeIndex._simple_new(dta, name=name)
-        if format != "mixed" and len(formats) > 0:
-            # formats[0][1] is the percentage of elements that matched
-            if errors == "raise" and formats[0][1] != 100:
-                raise ValueError(
-                    "No datetime format was found which "
-                    "matched all values in the array.\n"
-                    "You might want to try:\n"
-                    "    - passing `format` if your strings have a consistent format;\n"
-                    "    - passing `format='ISO8601'` if your strings are "
-                    "all ISO8601 but not necessarily in exactly the same format;\n"
-                    "    - passing `format='mixed'`, and the format will be "
-                    "inferred for each element individually. "
-                    "You might want to use `dayfirst` alongside this.\n"
-                    f"Best format found: {formats[0][0]} "
-                    "(matched {formats[0][1]}% of the values)"
-                )
-            best_format, best_format_dayfirst = _try_to_repect_dayfirst(
-                formats, dayfirst
-            )
-            if best_format_dayfirst is not None and best_format_dayfirst != dayfirst:
-                warnings.warn(
-                    f"Parsing dates in {best_format} format when "
-                    f"dayfirst={dayfirst} was specified. "
-                    f"Pass `dayfirst={not dayfirst}` or specify a format "
-                    "to silence this warning.",
-                    stacklevel=find_stack_level(),
-                )
-    if best_format is not None:
-        return _array_strptime_with_fallback(arg, name, utc, best_format, exact, errors)
+        return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
+
     if format == "mixed":
-        result, tz_parsed = _iterative_conversion(
-            arg, formats, utc, unit, errors, dayfirst, yearfirst, exact
+        return _iterative_conversion(
+            arg, name, utc, unit, errors, dayfirst, yearfirst, exact
         )
 
+    result, tz_parsed = objects_to_datetime64ns(
+        arg,
+        dayfirst=dayfirst,
+        yearfirst=yearfirst,
+        utc=utc,
+        errors=errors,
+        allow_object=True,
+    )
+
+    if tz_parsed is not None:
+        # We can take a shortcut since the datetime64 numpy array
+        # is in UTC
+        dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
+        return DatetimeIndex._simple_new(dta, name=name)
+
     return _box_as_indexlike(result, utc=utc, name=name)
 
 
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index b8f58dda81a5e..5f5f3bfb1d377 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -2673,7 +2673,7 @@ def test_to_datetime_dta_tz(self, klass):
 
 class TestGuessDatetimeFormat:
     @pytest.mark.parametrize(
-        "test_list, expected_formats",
+        "test_list, expected_format",
         [
             (
                 [
@@ -2681,69 +2681,37 @@ class TestGuessDatetimeFormat:
                     "2011-12-30 00:00:00.000000",
                     "2011-12-30 00:00:00.000000",
                 ],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
-            ),
-            (
-                [np.nan, np.nan, "2011-12-30 00:00:00.000000"],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
-            ),
-            (
-                ["", "2011-12-30 00:00:00.000000"],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
-            ),
-            (
-                ["NaT", "2011-12-30 00:00:00.000000"],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
-            ),
-            (
-                ["2011-12-30 00:00:00.000000", "random_string"],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 50)], dtype=object),
-            ),
-            (
-                ["now", "2011-12-30 00:00:00.000000"],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
-            ),
-            (
-                ["today", "2011-12-30 00:00:00.000000"],
-                np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object),
+                "%Y-%m-%d %H:%M:%S.%f",
             ),
+            ([np.nan, np.nan, "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"),
+            (["", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"),
+            (["NaT", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"),
+            (["2011-12-30 00:00:00.000000", "random_string"], "%Y-%m-%d %H:%M:%S.%f"),
+            (["now", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"),
+            (["today", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"),
             (
                 ["30-12-2012 00:00:00.000000", "12-30-2012 00:00:00.000000"],
-                np.array(
-                    [("%d-%m-%Y %H:%M:%S.%f", 50), ("%m-%d-%Y %H:%M:%S.%f", 50)],
-                    dtype=object,
-                ),
-            ),
-            (
-                ["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"],
-                np.array(
-                    [("%d-%m-%Y", 33), ("%m/%d/%Y", 33), ("%Y-%m-%d %H:%M:%S.%f", 33)],
-                    dtype=object,
-                ),
+                "%m-%d-%Y %H:%M:%S.%f",
             ),
+            (["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"], "%m/%d/%Y"),
         ],
     )
-    def test_guess_datetime_format_for_array(self, test_list, expected_formats):
+    def test_guess_datetime_format_for_array(self, test_list, expected_format):
         test_array = np.array(test_list, dtype=object)
         res = tools._guess_datetime_format_for_array(
-            test_array, n_find_format=5, n_check_format=5
+            test_array, dayfirst=False, n_find_format=5, n_check_format=5
         )
-        # sort according to first element of tuple (format string) to ignore order
-        sorted_index = np.argsort([x[0] for x in res])
-        res = res[sorted_index]
-        sorted_index = np.argsort([x[0] for x in expected_formats])
-        expected_formats = expected_formats[sorted_index]
-        assert (res == expected_formats).all()
-        # TODO more tests
+        assert res == expected_format
 
     @td.skip_if_not_us_locale
     def test_guess_datetime_format_for_array_all_nans(self):
         format_for_string_of_nans = tools._guess_datetime_format_for_array(
             np.array([np.nan, np.nan, np.nan], dtype="O"),
+            dayfirst=False,
             n_find_format=5,
             n_check_format=5,
         )
-        assert len(format_for_string_of_nans) == 0
+        assert format_for_string_of_nans is None
 
 
 class TestToDatetimeInferFormat:
@@ -3647,13 +3615,20 @@ def test_multiple_dates_non_ambiguous(self, date_str, expected_format, dayfirst)
                         result = to_datetime(
                             date_str, errors=errors, dayfirst=try_dayfirst
                         )
+                        # should also work for format="mixed"
+                        result_mixed = to_datetime(
+                            date_str,
+                            errors=errors,
+                            dayfirst=try_dayfirst,
+                            format="mixed",
+                        )
                 else:
                     result = to_datetime(date_str, errors=errors, dayfirst=try_dayfirst)
+                    result_mixed = to_datetime(
+                        date_str, errors=errors, dayfirst=try_dayfirst, format="mixed"
+                    )
                 tm.assert_index_equal(result, expected)
-
-        # should also work with format="mixed"
-        result = to_datetime(date_str, format="mixed")
-        tm.assert_index_equal(result, expected)
+                tm.assert_index_equal(result_mixed, expected)
 
     # ambiguous dates
     @pytest.mark.parametrize(
@@ -3708,12 +3683,16 @@ def test_multiple_dates_ambiguous_error(self, date_str):
         # should raise an error with "raise"
         with pytest.raises(
             ValueError,
-            match="No datetime format was found which matched all values in the array",
+            match="""time data "random_string" doesn't match format "%d-%m-%Y", """
+            "at position 2. "
+            f"{PARSING_ERR_MSG}",
         ):
             to_datetime(date_str, errors="raise", dayfirst=True)
         with pytest.raises(
             ValueError,
-            match="No datetime format was found which matched all values in the array",
+            match="""time data "random_string" doesn't match format "%m-%d-%Y", """
+            "at position 2. "
+            f"{PARSING_ERR_MSG}",
         ):
             to_datetime(date_str, errors="raise", dayfirst=False)
 
@@ -3796,12 +3775,18 @@ def test_multiple_dates_ambiguous_error(self, date_str):
     def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed):
         # no format works for all dates
         # raise should raise an error
+        msg = r'^time data ".*" doesn\'t match format ".*", at position .*'
         with pytest.raises(
             ValueError,
-            match="No datetime format was found which matched all values in the array",
+            match=msg,
         ):
-            to_datetime(date_str, errors="raise")
-
+            if expected_formats[0] == "%d-%m-%Y":
+                # contradicting default dayfirst=False
+                with tm.assert_produces_warning(UserWarning):
+                    # FIXME: do we need to raise a warning here?
+                    to_datetime(date_str, errors="raise")
+            else:
+                to_datetime(date_str, errors="raise")
         # coerce and ignore should choose the format
         # which works for the most dates (the first one)
         for errors in ["coerce", "ignore"]:
@@ -3816,5 +3801,15 @@ def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed):
 
         # if format="mixed", the conversion should be done from the best format
         # to the worst format
-        result = to_datetime(date_str, format="mixed")
-        tm.assert_index_equal(result, expected_mixed)
+        for errors in ["raise", "coerce", "ignore"]:
+            if expected_formats[0] == "%d-%m-%Y":
+                # we raise a warning if the best format used
+                # (the one which works for the most dates)
+                # contradict the default dayfirst=False
+                with tm.assert_produces_warning(UserWarning):
+                    result = to_datetime(date_str, format="mixed", errors=errors)
+            else:
+                # we don't raise a warning if other formats used
+                # contradict dayfirst
+                result = to_datetime(date_str, format="mixed", errors=errors)
+            tm.assert_index_equal(result, expected_mixed)

From 8c6401b9b30fe240230138ebb83c66badffb350f Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Thu, 13 Apr 2023 13:48:55 +0200
Subject: [PATCH 26/37] remove randomness

---
 pandas/_libs/tslib.pyx         | 10 ++++++----
 pandas/core/tools/datetimes.py | 14 ++++++--------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index 9b2ca61e12c29..fb96c9d6115c7 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -401,8 +401,8 @@ def first_non_null(values: ndarray) -> int:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def random_non_null(values: ndarray, int n) -> ndarray:
-    """Find n non-null values selected at random, return an array of indices."""
+def evenly_spaced_non_null(values: ndarray, int n) -> ndarray:
+    """Find n evenly spaced non-null values, return an array of indices."""
     cdef:
         Py_ssize_t total = len(values)
         Py_ssize_t i, non_null_count
@@ -422,8 +422,10 @@ def random_non_null(values: ndarray, int n) -> ndarray:
     non_null_count = len(non_null_indices)
     if non_null_count == 0 or n <= 0:
         return np.empty(0, dtype=np.int64)
-    # use np.random.choice
-    return np.random.choice(non_null_indices, min(n, non_null_count), replace=False)
+    evenly_spaced_indices = np.linspace(0, len(non_null_indices) - 1,
+                                        min(len(non_null_indices), n),
+                                        dtype=int)
+    return np.array(non_null_indices)[evenly_spaced_indices]
 
 
 @cython.wraparound(False)
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 8f16d16d37a07..8dd72bb3a9984 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -226,15 +226,13 @@ def _guess_datetime_format_for_array(
         match the format, sorted by the percentage of strings that match the
         format.
     """
-    # Extract a random sample of datetime strings
-    assert (
-        n_find_format <= n_check_format
-    ), "n_check_format must be greater than n_find_format"
-    sample_idx = tslib.random_non_null(arr, n_check_format)
-    sample_check = arr[sample_idx]
-    sample_find = sample_check[:n_find_format]
-    if len(sample_idx) == 0:
+    # Extract a sample of datetime strings
+    idx_find = tslib.evenly_spaced_non_null(arr, n_find_format)
+    if len(idx_find) == 0:
         return None
+    idx_check = tslib.evenly_spaced_non_null(arr, n_check_format)
+    sample_check = arr[idx_check]
+    sample_find = arr[idx_find]
     formats_found = set()
     for datetime_string in sample_find:
         # catch warnings from guess_datetime_format

From 28cf6796d23f6bf9ed8dcb537eba2a4e935cbb3e Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 10:08:45 +0200
Subject: [PATCH 27/37] fix parser tests

---
 pandas/core/tools/datetimes.py             |  5 +++--
 pandas/tests/io/parser/test_parse_dates.py | 17 +++++++++--------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 8dd72bb3a9984..18a7d621072ae 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -188,9 +188,10 @@ def _try_to_repect_dayfirst(
         and _check_format_dayfirst(best_formats[0][0]) is not None
         and _check_format_dayfirst(best_formats[0][0]) != dayfirst
     ):
+        default_string = " (the default)" if not dayfirst else ""
         warnings.warn(
             f"Parsing dates in {best_formats[0][0]} format when "
-            f"dayfirst={dayfirst} was specified. "
+            f"dayfirst={dayfirst}{default_string} was specified. "
             f"Pass `dayfirst={not dayfirst}` or specify a format "
             "to silence this warning.",
             stacklevel=find_stack_level(),
@@ -251,7 +252,7 @@ def _guess_datetime_format_for_array(
     # remove YDM as it does not exist
     # but is returned by guess_datetime_format
     for format_ in list(formats_found):
-        if (format_ is None) or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_):
+        if (format_ is None) or re.match(r".*%Y.*%d.*%m.*", format_):
             # doesn't exist but is returned by guess_datetime_format
             formats_found.remove(format_)
     # Try to apply the formats found
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 55efb9254ee34..5c82b652ebace 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1794,11 +1794,13 @@ def test_parse_delimited_date_swap_with_warning(
 
 def test_parse_multiple_delimited_dates_with_swap_warnings():
     # GH46210
-    with pytest.raises(
-        ValueError,
+    with tm.assert_produces_warning(
+        UserWarning,
         match=(
-            r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", '
-            r"at position 1. You might want to try:"
+            "Parsing dates in %d/%m/%Y format when "
+            "dayfirst=False \\(the default\\) was specified. "
+            "Pass `dayfirst=True` or specify a format "
+            "to silence this warning."
         ),
     ):
         pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"])
@@ -2008,10 +2010,9 @@ def test_dayfirst_warnings():
     tm.assert_index_equal(expected, res5)
 
     # B. use dayfirst=False
-    with tm.assert_produces_warning(UserWarning, match=warning_msg):
-        res6 = read_csv(
-            StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date"
-        ).index
+    res6 = read_csv(
+        StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date"
+    ).index
     tm.assert_index_equal(expected, res6)
 
 

From a22114ca6769ac645f2f6d9a611a4bd88176b3ab Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 11:18:41 +0200
Subject: [PATCH 28/37] simplify getting evenly spaced non null

---
 pandas/_libs/tslib.pyi         |  1 -
 pandas/_libs/tslib.pyx         | 32 +-------------------------------
 pandas/core/tools/datetimes.py | 17 ++++++++++++-----
 3 files changed, 13 insertions(+), 37 deletions(-)

diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi
index bd8748cd2650a..9819b5173db56 100644
--- a/pandas/_libs/tslib.pyi
+++ b/pandas/_libs/tslib.pyi
@@ -17,7 +17,6 @@ def array_with_unit_to_datetime(
     errors: str = ...,
 ) -> tuple[np.ndarray, tzinfo | None]: ...
 def first_non_null(values: np.ndarray) -> int: ...
-def random_non_null(values: np.ndarray, n: int) -> np.ndarray: ...
 def array_to_datetime(
     values: npt.NDArray[np.object_],
     errors: str = ...,
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index fb96c9d6115c7..106f203a16855 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -75,8 +75,7 @@ from pandas._libs.tslibs import (
 )
 from pandas._libs.tslibs.timestamps import Timestamp
 
-from libc.stdlib cimport srand
-from libc.time cimport time
+# Note: this is the only non-tslibs intra-pandas dependency here
 
 from pandas._libs.missing cimport checknull_with_nat_and_na
 from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
@@ -399,35 +398,6 @@ def first_non_null(values: ndarray) -> int:
         return -1
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def evenly_spaced_non_null(values: ndarray, int n) -> ndarray:
-    """Find n evenly spaced non-null values, return an array of indices."""
-    cdef:
-        Py_ssize_t total = len(values)
-        Py_ssize_t i, non_null_count
-        list non_null_indices = []
-    srand(time(NULL))
-    for i in range(total):
-        val = values[i]
-        if checknull_with_nat_and_na(val):
-            continue
-        if (
-            isinstance(val, str)
-            and
-            (len(val) == 0 or val in nat_strings or val in ("now", "today"))
-        ):
-            continue
-        non_null_indices.append(i)
-    non_null_count = len(non_null_indices)
-    if non_null_count == 0 or n <= 0:
-        return np.empty(0, dtype=np.int64)
-    evenly_spaced_indices = np.linspace(0, len(non_null_indices) - 1,
-                                        min(len(non_null_indices), n),
-                                        dtype=int)
-    return np.array(non_null_indices)[evenly_spaced_indices]
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 cpdef array_to_datetime(
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 18a7d621072ae..bd5c128021a90 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -228,12 +228,19 @@ def _guess_datetime_format_for_array(
         format.
     """
     # Extract a sample of datetime strings
-    idx_find = tslib.evenly_spaced_non_null(arr, n_find_format)
-    if len(idx_find) == 0:
+    # ignore missing
+    arr_non_null = arr[notna(arr)]
+    arr_non_null = arr_non_null[
+        ~np.isin(arr_non_null, ["", "now", "today"] + list(nat_strings))
+    ]
+    if len(arr_non_null) == 0:
         return None
-    idx_check = tslib.evenly_spaced_non_null(arr, n_check_format)
-    sample_check = arr[idx_check]
-    sample_find = arr[idx_find]
+    # get evenly spaced non-null indices
+    step_find = max(len(arr_non_null) // n_find_format, 1)
+    step_check = max(len(arr_non_null) // n_check_format, 1)
+    sample_check = arr_non_null[np.arange(0, len(arr_non_null), step_check)]
+    sample_find = arr_non_null[np.arange(0, len(arr_non_null), step_find)]
+    # try formats
     formats_found = set()
     for datetime_string in sample_find:
         # catch warnings from guess_datetime_format

From 75bb8f64490d93090e62949709779de44acf28c9 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 11:32:02 +0200
Subject: [PATCH 29/37] update io readme

---
 doc/source/user_guide/io.rst | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 60353dde5683f..ec677ea1030c0 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -977,11 +977,10 @@ Note that format inference is sensitive to ``dayfirst``.  With
 ``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With
 ``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th.
 
-If you try to parse a column of date strings, pandas will attempt to guess the format
-from the first non-NaN element, and will then parse the rest of the column with that
-format. If pandas fails to guess the format (for example if your first string is
-``'01 December US/Pacific 2000'``), then a warning will be raised and each
-row will be parsed individually by ``dateutil.parser.parse``. The safest
+If you try to parse a column of date strings, pandas will attempt to find the format
+which works best from a sample of non-NaN elements, and will then parse the rest of the
+column with that format. If pandas fails to guess the format, then a warning will be
+raised and each row will be parsed individually by ``dateutil.parser.parse``. The safest
 way to parse dates is to explicitly set ``format=``.
 
 .. ipython:: python
@@ -994,7 +993,9 @@ way to parse dates is to explicitly set ``format=``.
    df
 
 In the case that you have mixed datetime formats within the same column, you can
-pass  ``format='mixed'``
+pass ``format='mixed'``. Pandas will first parse the rows with the best format found
+(the one which matches the most rows), and then iteratively parse the remaining rows
+with the remaining formats.
 
 .. ipython:: python
 

From 6f155b5fd76925b23038a688a9608e7d5fd9559c Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 11:41:33 +0200
Subject: [PATCH 30/37] revert changed tests

---
 pandas/tests/tools/test_to_datetime.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index 5f5f3bfb1d377..9e59f047142ed 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -1302,7 +1302,10 @@ def test_datetime_bool_arrays_mixed(self, cache):
             to_datetime([False, datetime.today()], cache=cache)
         with pytest.raises(
             ValueError,
-            match=(f"{PARSING_ERR_MSG}"),
+            match=(
+                r'^time data "True" doesn\'t match format "%Y%m%d", '
+                f"at position 1. {PARSING_ERR_MSG}$"
+            ),
         ):
             to_datetime(["20130101", True], cache=cache)
         tm.assert_index_equal(
@@ -2390,7 +2393,10 @@ def test_to_datetime_on_datetime64_series(self, cache):
     def test_to_datetime_with_space_in_series(self, cache):
         # GH 6428
         ser = Series(["10/18/2006", "10/18/2008", " "])
-        msg = rf"{PARSING_ERR_MSG}"
+        msg = (
+            r'^time data " " doesn\'t match format "%m/%d/%Y", '
+            rf"at position 2. {PARSING_ERR_MSG}$"
+        )
         with pytest.raises(ValueError, match=msg):
             to_datetime(ser, errors="raise", cache=cache)
         result_coerce = to_datetime(ser, errors="coerce", cache=cache)
@@ -2735,7 +2741,10 @@ def test_to_datetime_infer_datetime_format_consistent_format(
     def test_to_datetime_inconsistent_format(self, cache):
         data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"]
         ser = Series(np.array(data))
-        msg = f"{PARSING_ERR_MSG}"
+        msg = (
+            r'^time data "01-02-2011 00:00:00" doesn\'t match format '
+            rf'"%m/%d/%Y %H:%M:%S", at position 1. {PARSING_ERR_MSG}$'
+        )
         with pytest.raises(ValueError, match=msg):
             to_datetime(ser, cache=cache)
 

From 2b2648e7fc19008beed7fde85ae67642f14051c5 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 12:51:40 +0200
Subject: [PATCH 31/37] fix type hints

---
 doc/source/whatsnew/v0.19.0.rst |  2 +-
 pandas/core/tools/datetimes.py  | 26 ++++++++++++++++----------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
index d4b879f137698..a0684db51c53e 100644
--- a/doc/source/whatsnew/v0.19.0.rst
+++ b/doc/source/whatsnew/v0.19.0.rst
@@ -765,7 +765,7 @@ Previously if ``.to_datetime()`` encountered mixed integers/floats and strings,
 This will now convert integers/floats with the default unit of ``ns``.
 
 .. ipython:: python
-
+   :okwarning:
    pd.to_datetime([1, "foo"], errors="coerce")
 
 Bug fixes related to ``.to_datetime()``:
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index bd5c128021a90..589d8ce9137dc 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -149,7 +149,7 @@ def _check_format_dayfirst(format_string: str) -> bool | None:
 
 
 def _try_to_repect_dayfirst(
-    formats: np.ndarray,
+    formats: list,
     dayfirst: bool | None,
     warn: bool,
 ) -> str:
@@ -241,7 +241,7 @@ def _guess_datetime_format_for_array(
     sample_check = arr_non_null[np.arange(0, len(arr_non_null), step_check)]
     sample_find = arr_non_null[np.arange(0, len(arr_non_null), step_find)]
     # try formats
-    formats_found = set()
+    formats_found = []
     for datetime_string in sample_find:
         # catch warnings from guess_datetime_format
         # which appears when dayfirst is contradicted
@@ -252,14 +252,17 @@ def _guess_datetime_format_for_array(
                 message="Parsing dates in .* format when dayfirst=.* was specified.",
             )
             if type(datetime_string) is str:
-                formats_found.add(
+                formats_found.append(
                     guess_datetime_format(datetime_string, dayfirst=False)
                 )
-                formats_found.add(guess_datetime_format(datetime_string, dayfirst=True))
+                formats_found.append(
+                    guess_datetime_format(datetime_string, dayfirst=True)
+                )
+    formats_found = [format_ for format_ in formats_found if format_ is not None]
     # remove YDM as it does not exist
     # but is returned by guess_datetime_format
-    for format_ in list(formats_found):
-        if (format_ is None) or re.match(r".*%Y.*%d.*%m.*", format_):
+    for format_ in np.unique(formats_found):
+        if re.match(r".*%Y.*%d.*%m.*", format_):
             # doesn't exist but is returned by guess_datetime_format
             formats_found.remove(format_)
     # Try to apply the formats found
@@ -283,25 +286,27 @@ def _guess_datetime_format_for_array(
             UserWarning,
             stacklevel=find_stack_level(),
         )
+    print(formats_checked)
     if not len(formats_checked):
         return None
     else:
         # Sort by the number of strings that match the format
         formats_checked.sort(key=lambda x: x[1], reverse=True)
         best_format = _try_to_repect_dayfirst(formats_checked, dayfirst, warn)
+        print(best_format)
         return best_format
 
 
 def _iterative_conversion(
     arg: np.ndarray,
-    name: str,
+    name: Hashable,
     utc: bool,
     unit: str | None,
     errors: DateTimeErrorChoices,
     dayfirst: bool | None,
     yearfirst: bool | None,
     exact: bool,
-) -> tuple[np.ndarray, np.ndarray]:
+) -> Index:
     """
     For mixed format, convert datetimestrings iteratively,
     from the best format (the format which work for most samples)
@@ -312,7 +317,7 @@ def _iterative_conversion(
     arg : ndarray
         Array of datetime strings.
     name : str
-        Name of the argument.
+        None or string for the Index name
     utc : bool
         Whether to convert/localize timestamps to UTC.
     unit : str
@@ -537,7 +542,7 @@ def _convert_and_box_cache(
 
 
 def _return_parsed_timezone_results(
-    result: np.ndarray, timezones, utc: bool, name: str
+    result: np.ndarray, timezones, utc: bool, name: Hashable
 ) -> Index:
     """
     Return results from array_strptime if a %z or %Z directive was passed.
@@ -995,6 +1000,7 @@ def to_datetime(
         - "mixed", to allow for multiple formats. Values will be parsed iteratively
         using the most promising format at each step. This is risky,
         and you should probably use it along with `dayfirst`.
+
     exact : bool, default True
         Control how `format` is used:
 

From 3f02e0a5782b99ef0cc232c85fafe34ec70ec136 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 14:42:50 +0200
Subject: [PATCH 32/37] fix type hints for np.unique

---
 pandas/core/tools/datetimes.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 589d8ce9137dc..7c3299b57ddda 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -252,13 +252,12 @@ def _guess_datetime_format_for_array(
                 message="Parsing dates in .* format when dayfirst=.* was specified.",
             )
             if type(datetime_string) is str:
-                formats_found.append(
-                    guess_datetime_format(datetime_string, dayfirst=False)
-                )
-                formats_found.append(
-                    guess_datetime_format(datetime_string, dayfirst=True)
-                )
-    formats_found = [format_ for format_ in formats_found if format_ is not None]
+                for try_dayfirst in [False, True]:
+                    format_found = guess_datetime_format(
+                        datetime_string, dayfirst=try_dayfirst
+                    )
+                    if format_found is not None:
+                        formats_found.append(format_found)
     # remove YDM as it does not exist
     # but is returned by guess_datetime_format
     for format_ in np.unique(formats_found):

From feaa7a365285240c6008783568d8db3b1d8d3a50 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 15:24:32 +0200
Subject: [PATCH 33/37] remove prints

---
 pandas/core/tools/datetimes.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 7c3299b57ddda..401f9a8bc7909 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -285,14 +285,12 @@ def _guess_datetime_format_for_array(
             UserWarning,
             stacklevel=find_stack_level(),
         )
-    print(formats_checked)
     if not len(formats_checked):
         return None
     else:
         # Sort by the number of strings that match the format
         formats_checked.sort(key=lambda x: x[1], reverse=True)
         best_format = _try_to_repect_dayfirst(formats_checked, dayfirst, warn)
-        print(best_format)
         return best_format
 
 

From 60148b18448c35de36cef1122027030b1c150822 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 17:12:59 +0200
Subject: [PATCH 34/37] fix doc

---
 doc/source/whatsnew/v0.19.0.rst |  1 +
 pandas/core/tools/datetimes.py  | 12 +++++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
index a0684db51c53e..74a4bebef84c5 100644
--- a/doc/source/whatsnew/v0.19.0.rst
+++ b/doc/source/whatsnew/v0.19.0.rst
@@ -766,6 +766,7 @@ This will now convert integers/floats with the default unit of ``ns``.
 
 .. ipython:: python
    :okwarning:
+
    pd.to_datetime([1, "foo"], errors="coerce")
 
 Bug fixes related to ``.to_datetime()``:
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 401f9a8bc7909..c0a52a73c4f0d 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -213,16 +213,18 @@ def _guess_datetime_format_for_array(
     ----------
     arr : ndarray
         Array of datetime strings.
+    dayfirst : bool
+        dayfirst parsing behavior from to_datetime.
     n_find_format : int
         Number of strings to use to guess the format.
     n_check_format : int
         Number of strings to check for each format found.
-    warn: bool
-        Whether to warn if we contradict dayfirst
+    warn : bool
+        Whether to warn if we contradict dayfirst.
 
     Returns
     -------
-    formats : ndarray
+    ndarray
         Array of tuples with the format and the percentage of strings that
         match the format, sorted by the percentage of strings that match the
         format.
@@ -1181,8 +1183,8 @@ def to_datetime(
     If multiple datetime formats are possible for a value, pandas will try to infer
     the most plausible format using the other examples.
 
-    >>> pd.to_datetime(["01-02-2012", "30-01-2012"])
-    DatetimeIndex(['2012-02-01', '2012-01-30'], dtype='datetime64[ns]', freq=None)
+    >>> pd.to_datetime(["01-02-2012", "02-30-2012"])
+    DatetimeIndex(['2012-01-02', '2012-02-30'], dtype='datetime64[ns]', freq=None)
 
     .. _to_datetime_tz_examples:
 

From 6622eba05b28ec0544a10446d5f0df83a22c5e93 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 17:58:08 +0200
Subject: [PATCH 35/37] fix example with February 30th

---
 pandas/core/tools/datetimes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index c0a52a73c4f0d..922cb2f396702 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -1183,8 +1183,8 @@ def to_datetime(
     If multiple datetime formats are possible for a value, pandas will try to infer
     the most plausible format using the other examples.
 
-    >>> pd.to_datetime(["01-02-2012", "02-30-2012"])
-    DatetimeIndex(['2012-01-02', '2012-02-30'], dtype='datetime64[ns]', freq=None)
+    >>> pd.to_datetime(["01-02-2012", "02-27-2012"])
+    DatetimeIndex(['2012-01-02', '2012-02-27'], dtype='datetime64[ns]', freq=None)
 
     .. _to_datetime_tz_examples:
 

From 23b28b93585dd7f76a51a5ab52b4a84af89426ae Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Fri, 14 Apr 2023 20:21:28 +0200
Subject: [PATCH 36/37] fix doc

---
 pandas/core/tools/datetimes.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 922cb2f396702..59b44d8f8aba7 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -996,9 +996,10 @@ def to_datetime(
 
         - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
           time string (not necessarily in exactly the same format);
+
         - "mixed", to allow for multiple formats. Values will be parsed iteratively
-        using the most promising format at each step. This is risky,
-        and you should probably use it along with `dayfirst`.
+          using the most promising format at each step. This is risky,
+          and you should probably use it along with `dayfirst`.
 
     exact : bool, default True
         Control how `format` is used:

From 5422bfa3c257d9472533cfcb3f0fa3cc483495d3 Mon Sep 17 00:00:00 2001
From: LeoGrin <leo.grinsztajn@polytechnique.edu>
Date: Mon, 24 Apr 2023 16:09:19 +0200
Subject: [PATCH 37/37] check if any str at the beginning of
 _guess_datetime_format_for_array

---
 pandas/core/tools/datetimes.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 59b44d8f8aba7..9e4b67bb265c7 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -242,6 +242,9 @@ def _guess_datetime_format_for_array(
     step_check = max(len(arr_non_null) // n_check_format, 1)
     sample_check = arr_non_null[np.arange(0, len(arr_non_null), step_check)]
     sample_find = arr_non_null[np.arange(0, len(arr_non_null), step_find)]
+    if not np.any([type(e) is str for e in sample_find]):
+        # GH#32264 np.str_ objects
+        return None
     # try formats
     formats_found = []
     for datetime_string in sample_find:
@@ -274,20 +277,15 @@ def _guess_datetime_format_for_array(
         formats_checked.append(
             (format_, int(100 * np.sum(~np.isnan(converted)) / len(converted)))
         )
-    if (
-        len(formats_checked) == 0
-        and len(sample_check) > 1
-        and np.any([type(e) is str for e in sample_find])
-        # GH#32264 np.str_ objects
-    ):
-        warnings.warn(
-            "Could not infer format, so each element will be parsed "
-            "individually, falling back to `dateutil`. To ensure parsing is "
-            "consistent and as-expected, please specify a format.",
-            UserWarning,
-            stacklevel=find_stack_level(),
-        )
     if not len(formats_checked):
+        if len(sample_check) > 1:
+            warnings.warn(
+                "Could not infer format, so each element will be parsed "
+                "individually, falling back to `dateutil`. To ensure parsing is "
+                "consistent and as-expected, please specify a format.",
+                UserWarning,
+                stacklevel=find_stack_level(),
+            )
         return None
     else:
         # Sort by the number of strings that match the format