diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 60353dde5683f..ec677ea1030c0 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -977,11 +977,10 @@ Note that format inference is sensitive to ``dayfirst``. With ``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With ``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th. -If you try to parse a column of date strings, pandas will attempt to guess the format -from the first non-NaN element, and will then parse the rest of the column with that -format. If pandas fails to guess the format (for example if your first string is -``'01 December US/Pacific 2000'``), then a warning will be raised and each -row will be parsed individually by ``dateutil.parser.parse``. The safest +If you try to parse a column of date strings, pandas will attempt to find the format +which works best on a sample of non-NaN elements, and will then parse the whole +column with that format. If pandas fails to guess the format, then a warning will be +raised and each row will be parsed individually by ``dateutil.parser.parse``. The safest way to parse dates is to explicitly set ``format=``. .. ipython:: python @@ -994,7 +993,9 @@ way to parse dates is to explicitly set ``format=``. df In the case that you have mixed datetime formats within the same column, you can -pass ``format='mixed'`` +pass ``format='mixed'``. pandas will first parse the rows with the best format found (the +one which matches the most rows), and then iteratively parse the remaining rows with the +next best formats. .. 
ipython:: python diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index d4b879f137698..74a4bebef84c5 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -765,6 +765,7 @@ Previously if ``.to_datetime()`` encountered mixed integers/floats and strings, This will now convert integers/floats with the default unit of ``ns``. .. ipython:: python + :okwarning: pd.to_datetime([1, "foo"], errors="coerce") diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b0e9fa2cea0ee..d5ad1e111dd64 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -16,8 +16,53 @@ Enhancements .. _whatsnew_210.enhancements.enhancement1: -enhancement1 -^^^^^^^^^^^^ +``pd.to_datetime`` now tries to infer the datetime format of the strings by considering +an evenly spaced sample of them (instead of only the first non-null element), +and tries to find the format which works for most strings. If several +formats work equally well, the one which matches the ``dayfirst`` parameter is used. If +``format="mixed"``, pandas does the same thing, then tries the second best format on the +strings which failed to parse with the best format, and so on (:issue:`52508`). + +*Previous behavior*: + +.. code-block:: ipython + + In [1]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"]) + Out[1]: + ValueError: time data "30-01-2012" doesn't match format "%m-%d-%Y", at position 2. You might want to try: + - passing `format` if your strings have a consistent format; + - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format; + - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this. 
+ + In [2]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], errors="coerce") + Out[2]: + DatetimeIndex(['2012-01-02', '2012-01-03', 'NaT'], dtype='datetime64[ns]', freq=None) + + In [3]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], format="mixed") + Out[3]: + DatetimeIndex(['2012-01-02', '2012-01-03', '2012-01-30'], dtype='datetime64[ns]', freq=None) + + +*New behavior*: + +.. code-block:: ipython + + In [1]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"]) + Out[1]: + UserWarning: Parsing dates in %d-%m-%Y format when dayfirst=False (the default) was specified. + Pass `dayfirst=True` or specify a format to silence this warning. + DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', + freq=None) + + In [2]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], errors="coerce") + Out[2]: + UserWarning: Parsing dates in %d-%m-%Y format when dayfirst=False (the default) was specified. Pass `dayfirst=True` or specify a format to silence this warning. + DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None) + + In [3]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], format="mixed") + Out[3]: + DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None) + .. 
_whatsnew_210.enhancements.enhancement2:
 
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 3e1b6070ffc39..9e4b67bb265c7 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -4,6 +4,7 @@
 from datetime import datetime
 from functools import partial
 from itertools import islice
+import re
 from typing import (
     TYPE_CHECKING,
     Callable,
@@ -128,27 +129,268 @@ class FulldatetimeDict(YearMonthDayDict, total=False):
 # ---------------------------------------------------------------------
 
 
-def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
-    # Try to guess the format based on the first non-NaN element, return None if can't
-    if (first_non_null := tslib.first_non_null(arr)) != -1:
-        if type(first_non_nan_element := arr[first_non_null]) is str:
-            # GH#32264 np.str_ object
-            guessed_format = guess_datetime_format(
-                first_non_nan_element, dayfirst=dayfirst
+def _check_format_dayfirst(format_string: str) -> bool | None:
+    """
+    Tell whether a strftime format puts the day before the month.
+
+    Returns True for day-month-year order, False for month-day-year order,
+    and None if %d, %m or %Y is missing or if the year does not come last.
+    """
+    directives = ("%d", "%m", "%Y")
+    if any(directive not in format_string for directive in directives):
+        return None
+
+    day, month, year = (format_string.index(directive) for directive in directives)
+    if day < month < year:
+        return True
+    if month < day < year:
+        return False
+    return None
+
+
+def _try_to_repect_dayfirst(
+    formats: list,
+    dayfirst: bool | None,
+    warn: bool,
+) -> str:
+    """
+    Pick the best format, preferring one which respects ``dayfirst`` on ties.
+
+    Parameters
+    ----------
+    formats : list of tuple
+        ``(format, percentage of strings matched)`` pairs, sorted by
+        decreasing percentage. Must not be empty.
+    dayfirst : bool or None
+        Whether day-first formats are preferred.
+    warn : bool
+        Whether to warn when the returned format contradicts ``dayfirst``.
+
+    Returns
+    -------
+    best_format : str
+        The first of the top-scoring formats which respects ``dayfirst``,
+        if any, otherwise the first top-scoring format.
+    """
+    # Find all formats which work for
+    # the largest number of samples
+    best_formats = [
+        formats_found for formats_found in formats if formats_found[1] == formats[0][1]
+    ]
+    # If several formats work equally well, prefer the format which
+    # respects dayfirst
+    if len(best_formats) > 1:
+        for formats_found in best_formats:
+            if _check_format_dayfirst(formats_found[0]) == dayfirst:
+                return formats_found[0]
+    if (
+        warn
+        and _check_format_dayfirst(best_formats[0][0]) is not None
+        and _check_format_dayfirst(best_formats[0][0]) != dayfirst
+    ):
+        default_string = " (the default)" if not dayfirst else ""
+        warnings.warn(
+            f"Parsing dates in {best_formats[0][0]} format when "
+            f"dayfirst={dayfirst}{default_string} was specified. "
+            f"Pass `dayfirst={not dayfirst}` or specify a format "
+            "to silence this warning.",
+            stacklevel=find_stack_level(),
+        )
+    return best_formats[0][0]
+
+
+def _guess_datetime_format_for_array(
+    arr: np.ndarray,
+    dayfirst: bool | None,
+    n_find_format: int = 10,
+    n_check_format: int = 200,
+    warn: bool = True,
+) -> str | None:
+    """
+    Guess the format of the datetime strings in an array.
+
+    Parameters
+    ----------
+    arr : ndarray
+        Array of datetime strings.
+    dayfirst : bool or None
+        dayfirst parsing behavior from to_datetime.
+    n_find_format : int
+        Approximate number of strings used to guess candidate formats.
+    n_check_format : int
+        Approximate number of strings each candidate format is checked on.
+    warn : bool
+        Whether to warn if we contradict dayfirst.
+
+    Returns
+    -------
+    str or None
+        The candidate format which parses the largest share of the checked
+        sample, ties being broken in favour of ``dayfirst``. None if there is
+        nothing to sample or if no format could be guessed.
+    """
+    # Extract a sample of datetime strings
+    # ignore missing
+    arr_non_null = arr[notna(arr)]
+    arr_non_null = arr_non_null[
+        ~np.isin(arr_non_null, ["", "now", "today"] + list(nat_strings))
+    ]
+    if len(arr_non_null) == 0:
+        return None
+    # get evenly spaced non-null indices
+    step_find = max(len(arr_non_null) // n_find_format, 1)
+    step_check = max(len(arr_non_null) // n_check_format, 1)
+    sample_check = arr_non_null[np.arange(0, len(arr_non_null), step_check)]
+    sample_find = arr_non_null[np.arange(0, len(arr_non_null), step_find)]
+    if not np.any([type(e) is str for e in sample_find]):
+        # GH#32264 np.str_ objects
+        return None
+    # try formats
+    formats_found = []
+    for datetime_string in sample_find:
+        # catch warnings from guess_datetime_format
+        # which appear when dayfirst is contradicted
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                category=UserWarning,
+                message="Parsing dates in .* format when dayfirst=.* was specified.",
             )
-            if guessed_format is not None:
-                return guessed_format
-            # If there are multiple non-null elements, warn about
-            # how parsing might not be consistent
-            if tslib.first_non_null(arr[first_non_null + 1 :]) != -1:
-                warnings.warn(
-                    "Could not infer format, so each element will be parsed "
-                    "individually, falling back to `dateutil`. To ensure parsing is "
-                    "consistent and as-expected, please specify a format.",
-                    UserWarning,
-                    stacklevel=find_stack_level(),
-                )
-    return None
+            if type(datetime_string) is str:
+                for try_dayfirst in [False, True]:
+                    format_found = guess_datetime_format(
+                        datetime_string, dayfirst=try_dayfirst
+                    )
+                    if format_found is not None:
+                        formats_found.append(format_found)
+    # Deduplicate (keeping first-seen order) and drop YDM formats: they do
+    # not exist but are returned by guess_datetime_format. ``list.remove``
+    # would only drop the first of several duplicated YDM entries.
+    formats_found = [
+        f for f in dict.fromkeys(formats_found) if not re.match(r".*%Y.*%d.*%m.*", f)
+    ]
+    # Try to apply the formats found
+    # to a larger sample
+    formats_checked = []
+    for format_ in formats_found:
+        converted = array_strptime(sample_check, fmt=format_, errors="coerce")[0]
+        formats_checked.append(
+            (format_, int(100 * np.sum(~np.isnan(converted)) / len(converted)))
+        )
+    if not len(formats_checked):
+        if len(sample_check) > 1:
+            warnings.warn(
+                "Could not infer format, so each element will be parsed "
+                "individually, falling back to `dateutil`. To ensure parsing is "
+                "consistent and as-expected, please specify a format.",
+                UserWarning,
+                stacklevel=find_stack_level(),
+            )
+        return None
+    else:
+        # Sort by the number of strings that match the format
+        formats_checked.sort(key=lambda x: x[1], reverse=True)
+        best_format = _try_to_repect_dayfirst(formats_checked, dayfirst, warn)
+        return best_format
+
+
+def _iterative_conversion(
+    arg: np.ndarray,
+    name: Hashable,
+    utc: bool,
+    unit: str | None,
+    errors: DateTimeErrorChoices,
+    dayfirst: bool | None,
+    yearfirst: bool | None,
+    exact: bool,
+) -> Index:
+    """
+    For mixed formats, convert datetime strings iteratively,
+    from the best format (the format which works for most samples)
+    to the worst.
+
+    Parameters
+    ----------
+    arg : ndarray
+        Array of datetime strings.
+    name : str
+        None or string for the Index name
+    utc : bool
+        Whether to convert/localize timestamps to UTC.
+    unit : str
+        None or string of the frequency of the passed data (currently unused)
+    errors : str
+        error handling behaviors from to_datetime, 'raise', 'coerce', 'ignore'
+    dayfirst : bool
+        dayfirst parsing behavior from to_datetime
+    yearfirst : bool
+        yearfirst parsing behavior from to_datetime (currently unused)
+    exact : bool
+        exact format matching behavior from to_datetime
+
+    Returns
+    -------
+    Index
+        The parsed values, boxed with timezone information if any was parsed.
+
+    Raises
+    ------
+    ValueError
+        If ``errors="raise"`` and some non-missing values match none of the
+        formats tried.
+    """
+    # Values which are missing to begin with (None/NaN, "" and NaT strings)
+    # come back as NaT from array_strptime in "coerce" mode. They are not
+    # parsing failures, so they must not reach the error handling below.
+    # NOTE(review): assumes array_strptime maps these inputs to NaT - confirm.
+    is_missing = ~notna(arg) | np.isin(arg, [""] + list(nat_strings))
+    # Convert with the best format, then iteratively convert the remaining
+    # values in "coerce" mode with the next best format, until no format
+    # can be guessed or 10 more formats have been tried.
+    # if we contradict dayfirst, we warn for the first format, but not the rest
+    # NOTE(review): best_format is None when no format can be guessed at all;
+    # confirm that array_strptime copes with fmt=None.
+    best_format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst, warn=True)
+    result, tz_parsed = array_strptime(arg, best_format, exact, "coerce", utc)
+    indices_succeeded = notna(result) | is_missing
+    for _ in range(10):
+        best_format = _guess_datetime_format_for_array(
+            arg[~indices_succeeded], dayfirst=dayfirst, warn=False
+        )
+
+        if best_format is None:
+            break
+        results_format, timezones_format = array_strptime(
+            arg[~indices_succeeded], best_format, exact, "coerce", utc
+        )
+        indices_succeeded_small = notna(results_format)
+        update_indices = np.arange(len(result))[~indices_succeeded][
+            indices_succeeded_small
+        ]
+        result[update_indices] = results_format[indices_succeeded_small]
+        # Index the full array directly: chaining two boolean masks would
+        # assign into a temporary copy and silently drop the timezones.
+        tz_parsed[update_indices] = timezones_format[indices_succeeded_small]
+        indices_succeeded[~indices_succeeded] = indices_succeeded_small
+        if indices_succeeded.all():
+            break
+    if not indices_succeeded.all():
+        # if we exhausted all formats and still have missing values
+        if errors == "raise":
+            raise ValueError(
+                f'Unable to parse "{arg[~indices_succeeded][0]}" as a date. '
+                'You can pass `errors="coerce"` or `errors="ignore"` to '
+                "ignore this error."
+            )
+        elif errors == "coerce":
+            result[~indices_succeeded] = iNaT
+        elif errors == "ignore":
+            result = arg
+
+    if any(tz is not None for tz in tz_parsed):
+        return _return_parsed_timezone_results(result, tz_parsed, utc, name)
+
+    return _box_as_indexlike(result, utc=utc, name=name)
 
 
 def should_cache(
@@ -314,7 +556,7 @@ def _convert_and_box_cache(
 
 
 def _return_parsed_timezone_results(
-    result: np.ndarray, timezones, utc: bool, name: str
+    result: np.ndarray, timezones, utc: bool, name: Hashable
 ) -> Index:
     """
     Return results from array_strptime if a %z or %Z directive was passed.
@@ -451,6 +693,11 @@ def _convert_listlike_datetimes(
     if format is not None and format != "mixed":
         return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
 
+    if format == "mixed":
+        return _iterative_conversion(
+            arg, name, utc, unit, errors, dayfirst, yearfirst, exact
+        )
+
     result, tz_parsed = objects_to_datetime64ns(
         arg,
         dayfirst=dayfirst,
@@ -764,8 +1011,11 @@ def to_datetime(
 
         - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
           time string (not necessarily in exactly the same format);
-        - "mixed", to infer the format for each element individually. This is risky,
+
+        - "mixed", to allow for multiple formats. Values will be parsed iteratively
+          using the most promising format at each step. This is risky,
           and you should probably use it along with `dayfirst`.
+
     exact : bool, default True
         Control how `format` is used:
 
@@ -944,6 +1194,14 @@ def to_datetime(
     >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
     NaT
 
+    **Ambiguous format**
+
+    If multiple datetime formats are possible for a value, pandas will try to infer
+    the most plausible format using the other values.
+
+    >>> pd.to_datetime(["01-02-2012", "02-27-2012"])
+    DatetimeIndex(['2012-01-02', '2012-02-27'], dtype='datetime64[ns]', freq=None)
+
     .. 
_to_datetime_tz_examples: **Timezones and time offsets** diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 55efb9254ee34..5c82b652ebace 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1794,11 +1794,13 @@ def test_parse_delimited_date_swap_with_warning( def test_parse_multiple_delimited_dates_with_swap_warnings(): # GH46210 - with pytest.raises( - ValueError, + with tm.assert_produces_warning( + UserWarning, match=( - r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", ' - r"at position 1. You might want to try:" + "Parsing dates in %d/%m/%Y format when " + "dayfirst=False \\(the default\\) was specified. " + "Pass `dayfirst=True` or specify a format " + "to silence this warning." ), ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) @@ -2008,10 +2010,9 @@ def test_dayfirst_warnings(): tm.assert_index_equal(expected, res5) # B. use dayfirst=False - with tm.assert_produces_warning(UserWarning, match=warning_msg): - res6 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" - ).index + res6 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" + ).index tm.assert_index_equal(expected, res6) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 0b5696116e610..9e59f047142ed 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1652,11 +1652,11 @@ def test_mixed_offsets_with_native_datetime_raises(self): mixed = to_datetime(ser) expected = Series( [ - "NaT", + NaT, Timestamp("1990-01-01"), Timestamp("2015-03-14T16:15:14.123-08:00").to_pydatetime(), Timestamp("2019-03-04T21:56:32.620-07:00").to_pydatetime(), - None, + NaT, # TODO check ], dtype=object, ) @@ -1855,7 +1855,7 @@ def test_unit_with_numeric(self, cache, errors, dtype): [ ["2015-06-19 05:33:20", "2015-05-27 
22:33:20", "NaT", "NaT"], [1.434692e18, 1.432766e18, "foo", "NaT"], - None, + UserWarning, ], ], ) @@ -2467,6 +2467,7 @@ def test_to_datetime_strings_vs_constructor(self, result): expected = Timestamp(2012, 1, 1) assert result == expected + @pytest.mark.filterwarnings("ignore:Could not infer format") def test_to_datetime_unprocessable_input(self, cache): # GH 4928 # GH 21864 @@ -2659,10 +2660,7 @@ def test_dayfirst_warnings_invalid_input(self): with pytest.raises( ValueError, - match=( - r'^time data "03/30/2011" doesn\'t match format ' - rf'"%d/%m/%Y", at position 1. {PARSING_ERR_MSG}$' - ), + match=(rf"{PARSING_ERR_MSG}"), ): to_datetime(arr, dayfirst=True) @@ -2681,30 +2679,43 @@ def test_to_datetime_dta_tz(self, klass): class TestGuessDatetimeFormat: @pytest.mark.parametrize( - "test_list", + "test_list, expected_format", [ - [ - "2011-12-30 00:00:00.000000", - "2011-12-30 00:00:00.000000", - "2011-12-30 00:00:00.000000", - ], - [np.nan, np.nan, "2011-12-30 00:00:00.000000"], - ["", "2011-12-30 00:00:00.000000"], - ["NaT", "2011-12-30 00:00:00.000000"], - ["2011-12-30 00:00:00.000000", "random_string"], - ["now", "2011-12-30 00:00:00.000000"], - ["today", "2011-12-30 00:00:00.000000"], + ( + [ + "2011-12-30 00:00:00.000000", + "2011-12-30 00:00:00.000000", + "2011-12-30 00:00:00.000000", + ], + "%Y-%m-%d %H:%M:%S.%f", + ), + ([np.nan, np.nan, "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + (["", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + (["NaT", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + (["2011-12-30 00:00:00.000000", "random_string"], "%Y-%m-%d %H:%M:%S.%f"), + (["now", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + (["today", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + ( + ["30-12-2012 00:00:00.000000", "12-30-2012 00:00:00.000000"], + "%m-%d-%Y %H:%M:%S.%f", + ), + (["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"], "%m/%d/%Y"), ], ) - def test_guess_datetime_format_for_array(self, 
test_list): - expected_format = "%Y-%m-%d %H:%M:%S.%f" + def test_guess_datetime_format_for_array(self, test_list, expected_format): test_array = np.array(test_list, dtype=object) - assert tools._guess_datetime_format_for_array(test_array) == expected_format + res = tools._guess_datetime_format_for_array( + test_array, dayfirst=False, n_find_format=5, n_check_format=5 + ) + assert res == expected_format @td.skip_if_not_us_locale def test_guess_datetime_format_for_array_all_nans(self): format_for_string_of_nans = tools._guess_datetime_format_for_array( - np.array([np.nan, np.nan, np.nan], dtype="O") + np.array([np.nan, np.nan, np.nan], dtype="O"), + dayfirst=False, + n_find_format=5, + n_check_format=5, ) assert format_for_string_of_nans is None @@ -3584,3 +3595,230 @@ def test_from_numeric_arrow_dtype(any_numeric_ea_dtype): result = to_datetime(ser) expected = Series([1, 2], dtype="datetime64[ns]") tm.assert_series_equal(result, expected) + + +class TestParsingMultipleDates: + # TODO handle yearfirst + @pytest.mark.parametrize( + "date_str, expected_format, dayfirst", + [ + (["2010-01-01", "2010-02-02", "2010-01-03"], "%Y-%m-%d", None), + (["2010-01-01", "2010-02-13", "2010-01-03"], "%Y-%m-%d", None), + (["01-01-2012", "01-13-2012", "01-03-2010"], "%m-%d-%Y", False), + (["01-01-2012", "13-01-2012", "01-03-2010"], "%d-%m-%Y", True), + ], + ) + def test_multiple_dates_non_ambiguous(self, date_str, expected_format, dayfirst): + # only one format is possible + expected = to_datetime(date_str, format=expected_format) + + # all errors should prefer the format + # which works for all dates + for errors in ["raise", "coerce", "ignore"]: + for try_dayfirst in [True, False]: + # warn if we contradict dayfirst + # we don't warn when format is "%Y-%m-%d" + # TODO same for yearfirst + if dayfirst is not None and try_dayfirst != dayfirst: + with tm.assert_produces_warning(UserWarning): + result = to_datetime( + date_str, errors=errors, dayfirst=try_dayfirst + ) + # should also 
work for format="mixed" + result_mixed = to_datetime( + date_str, + errors=errors, + dayfirst=try_dayfirst, + format="mixed", + ) + else: + result = to_datetime(date_str, errors=errors, dayfirst=try_dayfirst) + result_mixed = to_datetime( + date_str, errors=errors, dayfirst=try_dayfirst, format="mixed" + ) + tm.assert_index_equal(result, expected) + tm.assert_index_equal(result_mixed, expected) + + # ambiguous dates + @pytest.mark.parametrize( + "date_str", + [ + (["01-01-2012", "01-05-2012", "01-03-2010"]), + (["01-01-2012", "05-01-2012", "01-03-2010"]), + ], + ) + def test_multiple_dates_ambiguous(self, date_str): + # multiple formats work for all dates + # we should respect the dayfirst argument + expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y") + expected_not_dayfirst = to_datetime(date_str, format="%m-%d-%Y") + + for errors in ["raise", "coerce", "ignore"]: + result_dayfirst = to_datetime(date_str, errors=errors, dayfirst=True) + tm.assert_index_equal(result_dayfirst, expected_dayfirst) + + result_not_dayfirst = to_datetime(date_str, errors=errors, dayfirst=False) + tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst) + + # # should also work with format="mixed" + result_dayfirst = to_datetime(date_str, format="mixed", dayfirst=True) + tm.assert_index_equal(result_dayfirst, expected_dayfirst) + + result_not_dayfirst = to_datetime(date_str, format="mixed", dayfirst=False) + tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst) + + # ambiguous dates with errors + @pytest.mark.parametrize( + "date_str", + [ + (["01-01-2012", "01-05-2012", "random_string", "01-03-2010"]), + (["01-01-2012", "05-01-2012", "random_string", "01-03-2010"]), + ], + ) + def test_multiple_dates_ambiguous_error(self, date_str): + # multiple formats work for all dates + # we should respect the dayfirst argument + for errors in ["coerce", "ignore"]: + expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y", errors=errors) + expected_not_dayfirst = 
to_datetime( + date_str, format="%m-%d-%Y", errors=errors + ) + result_dayfirst = to_datetime(date_str, errors=errors, dayfirst=True) + tm.assert_index_equal(result_dayfirst, expected_dayfirst) + + result_not_dayfirst = to_datetime(date_str, errors=errors, dayfirst=False) + tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst) + + # should raise an error with "raise" + with pytest.raises( + ValueError, + match="""time data "random_string" doesn't match format "%d-%m-%Y", """ + "at position 2. " + f"{PARSING_ERR_MSG}", + ): + to_datetime(date_str, errors="raise", dayfirst=True) + with pytest.raises( + ValueError, + match="""time data "random_string" doesn't match format "%m-%d-%Y", """ + "at position 2. " + f"{PARSING_ERR_MSG}", + ): + to_datetime(date_str, errors="raise", dayfirst=False) + + # same with mixed + for errors in ["coerce", "ignore"]: + expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y", errors=errors) + expected_not_dayfirst = to_datetime( + date_str, format="%m-%d-%Y", errors=errors + ) + result_dayfirst = to_datetime( + date_str, errors=errors, dayfirst=True, format="mixed" + ) + tm.assert_index_equal(result_dayfirst, expected_dayfirst) + + result_not_dayfirst = to_datetime( + date_str, errors=errors, dayfirst=False, format="mixed" + ) + tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst) + + # should raise an error with "raise" + with pytest.raises( + ValueError, match="""Unable to parse "random_string" as a date""" + ): + to_datetime(date_str, errors="raise", dayfirst=True, format="mixed") + with pytest.raises( + ValueError, match="""Unable to parse "random_string" as a date""" + ): + to_datetime(date_str, errors="raise", dayfirst=False, format="mixed") + + # mixed formats + @pytest.mark.parametrize( + "date_str, expected_formats, expected_mixed", + [ + ( + [ + "01-02-2012", + "13-05-2012", + "14-03-2010", + "03-13-2012", + "15-05-2012", + "03-13-2010", + ], + ["%d-%m-%Y", "%m-%d-%Y"], + DatetimeIndex( + [ + 
"2012-02-01", + "2012-05-13", + "2010-03-14", + "2012-03-13", + "2012-05-15", + "2010-03-13", + ], + dtype="datetime64[ns]", + ), + ), + ( + [ + "01-02-2012", + "05-13-2012", + "03-14-2010", + "13-03-2012", + "05-15-2012", + "13-03-2010", + ], + ["%m-%d-%Y", "%d-%m-%Y"], + DatetimeIndex( + [ + "2012-01-02", + "2012-05-13", + "2010-03-14", + "2012-03-13", + "2012-05-15", + "2010-03-13", + ], + dtype="datetime64[ns]", + ), + ), + ], + ) + def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed): + # no format works for all dates + # raise should raise an error + msg = r'^time data ".*" doesn\'t match format ".*", at position .*' + with pytest.raises( + ValueError, + match=msg, + ): + if expected_formats[0] == "%d-%m-%Y": + # contradicting default dayfirst=False + with tm.assert_produces_warning(UserWarning): + # FIXME: do we need to raise a warning here? + to_datetime(date_str, errors="raise") + else: + to_datetime(date_str, errors="raise") + # coerce and ignore should choose the format + # which works for the most dates (the first one) + for errors in ["coerce", "ignore"]: + expected = to_datetime(date_str, format=expected_formats[0], errors=errors) + if expected_formats[0] == "%d-%m-%Y": + # contradicting default dayfirst=False + with tm.assert_produces_warning(UserWarning): + result = to_datetime(date_str, errors=errors) + else: + result = to_datetime(date_str, errors=errors) + tm.assert_index_equal(result, expected) + + # if format="mixed", the conversion should be done from the best format + # to the worst format + for errors in ["raise", "coerce", "ignore"]: + if expected_formats[0] == "%d-%m-%Y": + # we raise a warning if the best format used + # (the one which works for the most dates) + # contradict the default dayfirst=False + with tm.assert_produces_warning(UserWarning): + result = to_datetime(date_str, format="mixed", errors=errors) + else: + # we don't raise a warning if other formats used + # contradict dayfirst + result = 
to_datetime(date_str, format="mixed", errors=errors) + tm.assert_index_equal(result, expected_mixed)