From fd7a534d7780ba26a13dc85eb83c3ff2107cb360 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 12 Apr 2023 01:52:18 +0200 Subject: [PATCH 01/37] All tests pass --- pandas/_libs/tslib.pyx | 34 ++- pandas/core/tools/datetimes.py | 311 +++++++++++++++++++++---- pandas/tests/tools/test_to_datetime.py | 309 +++++++++++++++++++++--- 3 files changed, 575 insertions(+), 79 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 106f203a16855..8b790e3bd8adc 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -69,17 +69,22 @@ from pandas._libs.tslibs.nattype cimport ( ) from pandas._libs.tslibs.timestamps cimport _Timestamp +import cython + from pandas._libs.tslibs import ( Resolution, get_resolution, ) from pandas._libs.tslibs.timestamps import Timestamp -# Note: this is the only non-tslibs intra-pandas dependency here +from libc.stdlib cimport srand +from libc.time cimport time from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single +# Note: this is the only non-tslibs intra-pandas dependency here + def _test_parse_iso8601(ts: str): """ @@ -398,6 +403,33 @@ def first_non_null(values: ndarray) -> int: return -1 +@cython.wraparound(False) +@cython.boundscheck(False) +def random_non_null(values: ndarray, int n) -> ndarray: + """Find n non-null values selected at random, return an array of indices.""" + cdef: + Py_ssize_t total = len(values) + Py_ssize_t i, non_null_count + list non_null_indices = [] + srand(time(NULL)) + for i in range(total): + val = values[i] + if checknull_with_nat_and_na(val): + continue + if ( + isinstance(val, str) + and + (len(val) == 0 or val in nat_strings or val in ("now", "today")) + ): + continue + non_null_indices.append(i) + non_null_count = len(non_null_indices) + if non_null_count == 0 or n <= 0: + return np.empty(0, dtype=np.int64) + # use np.random.choice + return np.random.choice(non_null_indices, min(n, non_null_count), 
replace=False) + + @cython.wraparound(False) @cython.boundscheck(False) cpdef array_to_datetime( diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ac0a014a3ccf6..e785fe400c631 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -4,6 +4,7 @@ from datetime import datetime from functools import partial from itertools import islice +import re from typing import ( TYPE_CHECKING, Callable, @@ -129,27 +130,207 @@ class FulldatetimeDict(YearMonthDayDict, total=False): # --------------------------------------------------------------------- -def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: - # Try to guess the format based on the first non-NaN element, return None if can't - if (first_non_null := tslib.first_non_null(arr)) != -1: - if type(first_non_nan_element := arr[first_non_null]) is str: - # GH#32264 np.str_ object - guessed_format = guess_datetime_format( - first_non_nan_element, dayfirst=dayfirst +def _check_format_dayfirst(format_string): + dayfirst = False + for char in ["%d", "%m", "%Y"]: + if char not in format_string: + return None + + if format_string.index("%d") < format_string.index("%m") and format_string.index( + "%m" + ) < format_string.index("%Y"): + dayfirst = True + elif format_string.index("%m") < format_string.index("%d") and format_string.index( + "%d" + ) < format_string.index("%Y"): + dayfirst = False + else: + dayfirst = None + + return dayfirst + + +def _guess_datetime_format_for_array( + arr, n_find_format, n_check_format +) -> ArrayLike[tuple[str, str]]: + """ + Guess the format of the datetime strings in an array. + + Parameters + ---------- + arr : ndarray + Array of datetime strings. + n_find_format : int + Number of strings to use to guess the format. + n_check_format : int + Number of strings to check for each format found. 
+ + Returns + ------- + formats : ndarray + Array of tuples with the format and the percentage of strings that + match the format, sorted by the percentage of strings that match the + format. + """ + # Extract a random sample of datetime strings + assert ( + n_find_format <= n_check_format + ), "n_check_format must be greater than n_find_format" + sample_idx = tslib.random_non_null(arr, n_check_format) + sample_check = arr[sample_idx] + sample_find = sample_check[:n_find_format] + if len(sample_idx) == 0: + return [] # FIXME + format_found = set() + for datetime_string in sample_find: + # catch warnings from guess_datetime_format + # which appears when dayfirst is contradicted + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=UserWarning, + message="Parsing dates in .* format when dayfirst=.* was specified.", ) - if guessed_format is not None: - return guessed_format - # If there are multiple non-null elements, warn about - # how parsing might not be consistent - if tslib.first_non_null(arr[first_non_null + 1 :]) != -1: - warnings.warn( - "Could not infer format, so each element will be parsed " - "individually, falling back to `dateutil`. 
To ensure parsing is " - "consistent and as-expected, please specify a format.", - UserWarning, - stacklevel=find_stack_level(), - ) - return None + if type(datetime_string) is str: + format_found.add(guess_datetime_format(datetime_string, dayfirst=False)) + format_found.add(guess_datetime_format(datetime_string, dayfirst=True)) + if None in format_found: + format_found.remove(None) + # remove YDM as it does not exist + # but is returned by guess_datetime_format + for format in list(format_found): + if re.match(r"%Y[-/_.]+%d[-/_.]+%m", format): + # doesn't exist but is returned by guess_datetime_format + # FIXME + format_found.remove(format) + # Try to apply the formats found + # to a larger sample + format_checked = [] + for format in format_found: + converted = array_strptime(sample_check, fmt=format, errors="coerce")[0] + format_checked.append( + (format, int(100 * np.sum(~np.isnan(converted)) / len(converted))) + ) + # Sort by the number of strings that match the format + format_checked.sort(key=lambda x: x[1], reverse=True) + if ( + len(format_checked) == 0 + and len(sample_check) > 1 + and np.any([type(e) is str for e in sample_find]) + # GH#32264 np.str_ objects + ): + warnings.warn( + "Could not infer format, so each element will be parsed " + "individually, falling back to `dateutil`. To ensure parsing is " + "consistent and as-expected, please specify a format.", + UserWarning, + stacklevel=find_stack_level(), + ) + return np.array(format_checked, dtype=object) + + +def _try_to_repect_dayfirst(formats, dayfirst): + """ + If several formats work as well, prefer the format which + respect dayfirst. + + Parameters + ---------- + formats : ndarray + Array of tuples with the format and the percentage of strings that + match the format, sorted by the percentage of strings that match the + format. 
+ dayfirst : bool + Should we prefer dayfirst formats + + Returns + ------- + best_format : str + The format among the best formats which respect dayfirst, + if any, otherwise the first best format. + """ + # Find all formats which work for + # the largest number of samples + best_formats = [ + format_found for format_found in formats if format_found[1] == formats[0][1] + ] + # If several formats work as well, prefer the format which + # respect dayfirst + if len(best_formats) > 1: + for format_found in best_formats: + if _check_format_dayfirst(format_found[0]) == dayfirst: + return format_found[0], _check_format_dayfirst(format_found[0]) + return best_formats[0][0], _check_format_dayfirst(best_formats[0][0]) + + +def _iterative_conversion(arg, formats, utc, unit, errors, dayfirst, yearfirst, exact): + """ + For mixed format, convert datetimestrings iteratively, + from the best format (the format which work for most samples) + to the worst. + + Parameters + ---------- + arg : ndarray + Array of datetime strings. + formats : ndarray + Array of tuples with the format and the percentage of strings that + match the format, sorted by the percentage of strings that match the + format. + utc : bool + Whether to convert/localize timestamps to UTC. 
+ unit : str + None or string of the frequency of the passed data + errors : str + error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore' + dayfirst : bool + dayfirst parsing behavior from to_datetime + yearfirst : bool + yearfirst parsing behavior from to_datetime + exact : bool, default True + exact format matching behavior from to_datetime + + """ + # iteratively convert the remaining samples + # in "coerce" mode with the ith best format + # until all values are converted or all formats are exhausted + # or 10 formats have been tried + best_format = _try_to_repect_dayfirst(formats, dayfirst)[0] + # remove the best format from the list + formats = formats[formats[:, 0] != best_format] + result, tz_parsed = array_strptime(arg, best_format, exact, "coerce", utc) + indices_succeeded = notna(result) + for _ in range(min(len(formats), 10)): + best_format = _try_to_repect_dayfirst(formats, dayfirst)[0] + formats = formats[formats[:, 0] != best_format] + results_format, timezones_format = array_strptime( + arg[~indices_succeeded], best_format, exact, "coerce", utc + ) + indices_succeeded_small = notna(results_format) + update_indices = np.arange(len(result))[~indices_succeeded][ + indices_succeeded_small + ] + result[update_indices] = results_format[indices_succeeded_small] + tz_parsed[~indices_succeeded][indices_succeeded_small] = timezones_format[ + indices_succeeded_small + ] + indices_succeeded[~indices_succeeded] = indices_succeeded_small + if indices_succeeded.all(): + break + if not indices_succeeded.all(): + # if we exhausted all formats and still have missing values + if errors == "raise": + raise ValueError( + f"""Unable to parse "{arg[~indices_succeeded][0]}" as a date. 
+ You can pass `errors="coerce"` or `errors="ignore"` to + ignore this error.""" + ) + elif errors == "coerce": + result[~indices_succeeded] = iNaT + elif errors == "ignore": + # TODO check + result = arg + return result, tz_parsed def should_cache( @@ -445,27 +626,64 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) - if format is None: - format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - - # `format` could be inferred, or user didn't ask for mixed-format parsing. + # get the list of formats which work for some of the elements + # sorted by the percentage of elements that match, highest first + # It's a list of tuples of (format, percentage of elements that match) + best_format = None if format is not None and format != "mixed": - return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) - - result, tz_parsed = objects_to_datetime64ns( - arg, - dayfirst=dayfirst, - yearfirst=yearfirst, - utc=utc, - errors=errors, - allow_object=True, - ) - - if tz_parsed is not None: - # We can take a shortcut since the datetime64 numpy array - # is in UTC - dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) - return DatetimeIndex._simple_new(dta, name=name) + best_format = format + else: + # guess the format + formats = _guess_datetime_format_for_array( + arg, n_find_format=20, n_check_format=250 + ) + if len(formats) == 0: + result, tz_parsed = objects_to_datetime64ns( + arg, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + errors=errors, + allow_object=True, + ) + if tz_parsed is not None: + # We can take a shortcut since the datetime64 numpy array + # is in UTC + dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) + return DatetimeIndex._simple_new(dta, name=name) + if format != "mixed" and len(formats) > 0: + # formats[0][1] is the percentage of elements that matched + if errors == "raise" and formats[0][1] != 100: + raise ValueError( + "No datetime format was found which " + "matched all values in the 
array.\n" + "You might want to try:\n" + " - passing `format` if your strings have a consistent format;\n" + " - passing `format='ISO8601'` if your strings are " + "all ISO8601 but not necessarily in exactly the same format;\n" + " - passing `format='mixed'`, and the format will be " + "inferred for each element individually. " + "You might want to use `dayfirst` alongside this.\n" + f"Best format found: {formats[0][0]} " + "(matched {formats[0][1]}% of the values)" + ) + best_format, best_format_dayfirst = _try_to_repect_dayfirst( + formats, dayfirst + ) + if best_format_dayfirst is not None and best_format_dayfirst != dayfirst: + warnings.warn( + f"Parsing dates in {best_format} format when " + f"dayfirst={dayfirst} was specified. " + f"Pass `dayfirst={not dayfirst}` or specify a format " + "to silence this warning.", + stacklevel=find_stack_level(), + ) + if best_format is not None: + return _array_strptime_with_fallback(arg, name, utc, best_format, exact, errors) + if format == "mixed": + result, tz_parsed = _iterative_conversion( + arg, formats, utc, unit, errors, dayfirst, yearfirst, exact + ) return _box_as_indexlike(result, utc=utc, name=name) @@ -765,8 +983,9 @@ def to_datetime( - "ISO8601", to parse any `ISO8601 `_ time string (not necessarily in exactly the same format); - - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. + - "mixed", to allow for multiple formats. Values will be parsed iteratively + using the most promising format at each step. This is risky, + and you should probably use it along with `dayfirst`. exact : bool, default True Control how `format` is used: @@ -945,6 +1164,14 @@ def to_datetime( >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT + **Ambiguous format** + + If multiple datetime formats are possible for a value, pandas will try to infer + the most plausible format using the other examples. 
+ + >>> pd.to_datetime(["01-02-2012", "30-01-2012"]) + DatetimeIndex(['2012-02-01', '2012-01-30'], dtype='datetime64[ns]', freq=None) + .. _to_datetime_tz_examples: **Timezones and time offsets** diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 7b707be97c653..326810177fb3a 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1313,10 +1313,7 @@ def test_datetime_bool_arrays_mixed(self, cache): to_datetime([False, datetime.today()], cache=cache) with pytest.raises( ValueError, - match=( - r'^time data "True" doesn\'t match format "%Y%m%d", ' - f"at position 1. {PARSING_ERR_MSG}$" - ), + match=(f"{PARSING_ERR_MSG}"), ): to_datetime(["20130101", True], cache=cache) tm.assert_index_equal( @@ -1663,11 +1660,11 @@ def test_mixed_offsets_with_native_datetime_raises(self): mixed = to_datetime(ser) expected = Series( [ - "NaT", + NaT, Timestamp("1990-01-01"), Timestamp("2015-03-14T16:15:14.123-08:00").to_pydatetime(), Timestamp("2019-03-04T21:56:32.620-07:00").to_pydatetime(), - None, + NaT, # TODO check ], dtype=object, ) @@ -1866,7 +1863,7 @@ def test_unit_with_numeric(self, cache, errors, dtype): [ ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"], [1.434692e18, 1.432766e18, "foo", "NaT"], - None, + UserWarning, ], ], ) @@ -2404,10 +2401,7 @@ def test_to_datetime_on_datetime64_series(self, cache): def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) - msg = ( - r'^time data " " doesn\'t match format "%m/%d/%Y", ' - rf"at position 2. 
{PARSING_ERR_MSG}$" - ) + msg = rf"{PARSING_ERR_MSG}" with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) @@ -2478,6 +2472,7 @@ def test_to_datetime_strings_vs_constructor(self, result): expected = Timestamp(2012, 1, 1) assert result == expected + @pytest.mark.filterwarnings("ignore:Could not infer format") def test_to_datetime_unprocessable_input(self, cache): # GH 4928 # GH 21864 @@ -2670,10 +2665,7 @@ def test_dayfirst_warnings_invalid_input(self): with pytest.raises( ValueError, - match=( - r'^time data "03/30/2011" doesn\'t match format ' - rf'"%d/%m/%Y", at position 1. {PARSING_ERR_MSG}$' - ), + match=(rf"{PARSING_ERR_MSG}"), ): to_datetime(arr, dayfirst=True) @@ -2692,32 +2684,77 @@ def test_to_datetime_dta_tz(self, klass): class TestGuessDatetimeFormat: @pytest.mark.parametrize( - "test_list", + "test_list, expected_formats", [ - [ - "2011-12-30 00:00:00.000000", - "2011-12-30 00:00:00.000000", - "2011-12-30 00:00:00.000000", - ], - [np.nan, np.nan, "2011-12-30 00:00:00.000000"], - ["", "2011-12-30 00:00:00.000000"], - ["NaT", "2011-12-30 00:00:00.000000"], - ["2011-12-30 00:00:00.000000", "random_string"], - ["now", "2011-12-30 00:00:00.000000"], - ["today", "2011-12-30 00:00:00.000000"], + ( + [ + "2011-12-30 00:00:00.000000", + "2011-12-30 00:00:00.000000", + "2011-12-30 00:00:00.000000", + ], + np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + ), + ( + [np.nan, np.nan, "2011-12-30 00:00:00.000000"], + np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + ), + ( + ["", "2011-12-30 00:00:00.000000"], + np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + ), + ( + ["NaT", "2011-12-30 00:00:00.000000"], + np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + ), + ( + ["2011-12-30 00:00:00.000000", "random_string"], + np.array([("%Y-%m-%d %H:%M:%S.%f", 50)], dtype=object), + ), + ( + ["now", "2011-12-30 00:00:00.000000"], + 
np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + ), + ( + ["today", "2011-12-30 00:00:00.000000"], + np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + ), + ( + ["30-12-2012 00:00:00.000000", "12-30-2012 00:00:00.000000"], + np.array( + [("%d-%m-%Y %H:%M:%S.%f", 50), ("%m-%d-%Y %H:%M:%S.%f", 50)], + dtype=object, + ), + ), + ( + ["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"], + np.array( + [("%d-%m-%Y", 33), ("%m/%d/%Y", 33), ("%Y-%m-%d %H:%M:%S.%f", 33)], + dtype=object, + ), + ), ], ) - def test_guess_datetime_format_for_array(self, test_list): - expected_format = "%Y-%m-%d %H:%M:%S.%f" + def test_guess_datetime_format_for_array(self, test_list, expected_formats): test_array = np.array(test_list, dtype=object) - assert tools._guess_datetime_format_for_array(test_array) == expected_format + res = tools._guess_datetime_format_for_array( + test_array, n_find_format=5, n_check_format=5 + ) + # sort according to first element of tuple (format string) to ignore order + sorted_index = np.argsort([x[0] for x in res]) + res = res[sorted_index] + sorted_index = np.argsort([x[0] for x in expected_formats]) + expected_formats = expected_formats[sorted_index] + assert (res == expected_formats).all() + # TODO more tests @td.skip_if_not_us_locale def test_guess_datetime_format_for_array_all_nans(self): format_for_string_of_nans = tools._guess_datetime_format_for_array( - np.array([np.nan, np.nan, np.nan], dtype="O") + np.array([np.nan, np.nan, np.nan], dtype="O"), + n_find_format=5, + n_check_format=5, ) - assert format_for_string_of_nans is None + assert len(format_for_string_of_nans) == 0 class TestToDatetimeInferFormat: @@ -2741,10 +2778,7 @@ def test_to_datetime_infer_datetime_format_consistent_format( def test_to_datetime_inconsistent_format(self, cache): data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] ser = Series(np.array(data)) - msg = ( - r'^time data "01-02-2011 00:00:00" doesn\'t match format ' - rf'"%m/%d/%Y 
%H:%M:%S", at position 1. {PARSING_ERR_MSG}$' - ) + msg = f"{PARSING_ERR_MSG}" with pytest.raises(ValueError, match=msg): to_datetime(ser, cache=cache) @@ -3595,3 +3629,206 @@ def test_from_numeric_arrow_dtype(any_numeric_ea_dtype): result = to_datetime(ser) expected = Series([1, 2], dtype="datetime64[ns]") tm.assert_series_equal(result, expected) + + +class TestParsingMultipleDates: + # TODO handle yearfirst + @pytest.mark.parametrize( + "date_str, expected_format, dayfirst", + [ + (["2010-01-01", "2010-02-02", "2010-01-03"], "%Y-%m-%d", None), + (["2010-01-01", "2010-02-13", "2010-01-03"], "%Y-%m-%d", None), + (["01-01-2012", "01-13-2012", "01-03-2010"], "%m-%d-%Y", False), + (["01-01-2012", "13-01-2012", "01-03-2010"], "%d-%m-%Y", True), + ], + ) + def test_multiple_dates_non_ambiguous(self, date_str, expected_format, dayfirst): + # only one format is possible + expected = to_datetime(date_str, format=expected_format) + + # all errors should prefer the format + # which works for all dates + for errors in ["raise", "coerce", "ignore"]: + for try_dayfirst in [True, False]: + # warn if we contradict dayfirst + # we don't warn when format is "%Y-%m-%d" + # TODO same for yearfirst + if dayfirst is not None and try_dayfirst != dayfirst: + with tm.assert_produces_warning(UserWarning): + result = to_datetime( + date_str, errors=errors, dayfirst=try_dayfirst + ) + else: + result = to_datetime(date_str, errors=errors, dayfirst=try_dayfirst) + tm.assert_index_equal(result, expected) + + # should also work with format="mixed" + result = to_datetime(date_str, format="mixed") + tm.assert_index_equal(result, expected) + + # ambiguous dates + @pytest.mark.parametrize( + "date_str", + [ + (["01-01-2012", "01-05-2012", "01-03-2010"]), + (["01-01-2012", "05-01-2012", "01-03-2010"]), + ], + ) + def test_multiple_dates_ambiguous(self, date_str): + # multiple formats work for all dates + # we should respect the dayfirst argument + expected_dayfirst = to_datetime(date_str, 
format="%d-%m-%Y") + expected_not_dayfirst = to_datetime(date_str, format="%m-%d-%Y") + + for errors in ["raise", "coerce", "ignore"]: + result_dayfirst = to_datetime(date_str, errors=errors, dayfirst=True) + tm.assert_index_equal(result_dayfirst, expected_dayfirst) + + result_not_dayfirst = to_datetime(date_str, errors=errors, dayfirst=False) + tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst) + + # # should also work with format="mixed" + result_dayfirst = to_datetime(date_str, format="mixed", dayfirst=True) + tm.assert_index_equal(result_dayfirst, expected_dayfirst) + + result_not_dayfirst = to_datetime(date_str, format="mixed", dayfirst=False) + tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst) + + # ambiguous dates with errors + @pytest.mark.parametrize( + "date_str", + [ + (["01-01-2012", "01-05-2012", "random_string", "01-03-2010"]), + (["01-01-2012", "05-01-2012", "random_string", "01-03-2010"]), + ], + ) + def test_multiple_dates_ambiguous_error(self, date_str): + # multiple formats work for all dates + # we should respect the dayfirst argument + for errors in ["coerce", "ignore"]: + expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y", errors=errors) + expected_not_dayfirst = to_datetime( + date_str, format="%m-%d-%Y", errors=errors + ) + result_dayfirst = to_datetime(date_str, errors=errors, dayfirst=True) + tm.assert_index_equal(result_dayfirst, expected_dayfirst) + + result_not_dayfirst = to_datetime(date_str, errors=errors, dayfirst=False) + tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst) + + # should raise an error with "raise" + with pytest.raises( + ValueError, + match="No datetime format was found which matched all values in the array", + ): + to_datetime(date_str, errors="raise", dayfirst=True) + with pytest.raises( + ValueError, + match="No datetime format was found which matched all values in the array", + ): + to_datetime(date_str, errors="raise", dayfirst=False) + + # same with mixed + 
for errors in ["coerce", "ignore"]: + expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y", errors=errors) + expected_not_dayfirst = to_datetime( + date_str, format="%m-%d-%Y", errors=errors + ) + result_dayfirst = to_datetime( + date_str, errors=errors, dayfirst=True, format="mixed" + ) + tm.assert_index_equal(result_dayfirst, expected_dayfirst) + + result_not_dayfirst = to_datetime( + date_str, errors=errors, dayfirst=False, format="mixed" + ) + tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst) + + # should raise an error with "raise" + with pytest.raises( + ValueError, match="""Unable to parse "random_string" as a date""" + ): + to_datetime(date_str, errors="raise", dayfirst=True, format="mixed") + with pytest.raises( + ValueError, match="""Unable to parse "random_string" as a date""" + ): + to_datetime(date_str, errors="raise", dayfirst=False, format="mixed") + + # mixed formats + @pytest.mark.parametrize( + "date_str, expected_formats, expected_mixed", + [ + ( + [ + "01-02-2012", + "13-05-2012", + "14-03-2010", + "03-13-2012", + "15-05-2012", + "03-13-2010", + ], + ["%d-%m-%Y", "%m-%d-%Y"], + DatetimeIndex( + [ + "2012-02-01", + "2012-05-13", + "2010-03-14", + "2012-03-13", + "2012-05-15", + "2010-03-13", + ], + dtype="datetime64[ns]", + ), + ), + ( + [ + "01-02-2012", + "05-13-2012", + "03-14-2010", + "13-03-2012", + "05-15-2012", + "13-03-2010", + ], + ["%m-%d-%Y", "%d-%m-%Y"], + DatetimeIndex( + [ + "2012-01-02", + "2012-05-13", + "2010-03-14", + "2012-03-13", + "2012-05-15", + "2010-03-13", + ], + dtype="datetime64[ns]", + ), + ), + ], + ) + def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed): + # no format works for all dates + # raise should raise an error + with pytest.raises( + ValueError, + match="No datetime format was found which matched all values in the array", + ): + to_datetime(date_str, errors="raise") + + # coerce and ignore should choose the format + # which works for the most dates (the 
first one) + for errors in ["coerce", "ignore"]: + expected = to_datetime(date_str, format=expected_formats[0], errors=errors) + if expected_formats[0] == "%d-%m-%Y": + # contradicting default dayfirst=False + with tm.assert_produces_warning(UserWarning): + result = to_datetime(date_str, errors=errors) + else: + result = to_datetime(date_str, errors=errors) + tm.assert_index_equal(result, expected) + + # if format="mixed", the conversion should be done from the best format + # to the worst format + result = to_datetime(date_str, format="mixed") + tm.assert_index_equal(result, expected_mixed) + + # TODO multiple precision + # (["2011-12-30 00:00:00.000000", "2011-11-20"], "%Y-%m-%d", None) From 93f9c7aeb4f0be29755bd1147a2bb521bb568293 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 12 Apr 2023 14:26:35 +0200 Subject: [PATCH 02/37] Update changelog --- doc/source/whatsnew/v2.1.0.rst | 48 +++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 38161a29a9ff7..e61a4d7702da7 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -16,7 +16,53 @@ Enhancements .. _whatsnew_210.enhancements.enhancement1: -enhancement1 +``pd.to_datetime`` now tries to infer the datetime format of each string by considering +the whole Series, and tries to find the format which work for most strings. If several +formats work as well, the one which matches the ``dayfirst`` parameter is returned. If +``format="mixed"``, pandas does the same thing, then tries the second best format on the +strings which failed to parse with the first best format, and so on (:issue:`52508`). + +*Previous behavior*: + +.. code-block:: ipython + + In [1]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"]) + Out[1]: + ValueError: time data "30-01-2012" doesn't match format "%m-%d-%Y", at position 2. 
You might want to try: + - passing `format` if your strings have a consistent format; + - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format; + - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this. + + In [2]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], errors="coerce") + Out[2]: + DatetimeIndex(['2012-01-02', '2012-01-03', 'NaT'], dtype='datetime64[ns]', freq=None) + + In [3]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], format="mixed") + Out[3]: + DatetimeIndex(['2012-01-02', '2012-01-03', '2012-01-30'], dtype='datetime64[ns]', freq=None) + + +*New behavior*: + +.. code-block:: ipython + + In [1]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"]) + Out[1]: + UserWarning: Parsing dates in %d-%m-%Y format when dayfirst=False was specified. + Pass `dayfirst=True` or specify a format to silence this warning. + DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', + freq=None) + + In [2]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], errors="coerce") + Out[2]: + UserWarning: Parsing dates in %d-%m-%Y format when dayfirst=False was specified. Pass `dayfirst=True` or specify a format to silence this warning. + DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None) + + In [3]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], format="mixed") + Out[3]: + DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None) + + ^^^^^^^^^^^^ .. 
_whatsnew_210.enhancements.enhancement2: From c37d40fa83002a73ddb1d50e8d759ee318127fbd Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 12 Apr 2023 14:49:47 +0200 Subject: [PATCH 03/37] Add missing type hints --- pandas/_libs/tslib.pyi | 1 + pandas/core/tools/datetimes.py | 43 ++++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 9819b5173db56..bd8748cd2650a 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -17,6 +17,7 @@ def array_with_unit_to_datetime( errors: str = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... def first_non_null(values: np.ndarray) -> int: ... +def random_non_null(values: np.ndarray, n: int) -> np.ndarray: ... def array_to_datetime( values: npt.NDArray[np.object_], errors: str = ..., diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index e785fe400c631..1a1195a536584 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -130,8 +130,7 @@ class FulldatetimeDict(YearMonthDayDict, total=False): # --------------------------------------------------------------------- -def _check_format_dayfirst(format_string): - dayfirst = False +def _check_format_dayfirst(format_string: str) -> bool | None: for char in ["%d", "%m", "%Y"]: if char not in format_string: return None @@ -151,8 +150,10 @@ def _check_format_dayfirst(format_string): def _guess_datetime_format_for_array( - arr, n_find_format, n_check_format -) -> ArrayLike[tuple[str, str]]: + arr: np.ndarray, + n_find_format: int, + n_check_format: int, +) -> np.ndarray: """ Guess the format of the datetime strings in an array. 
@@ -180,7 +181,7 @@ def _guess_datetime_format_for_array( sample_check = arr[sample_idx] sample_find = sample_check[:n_find_format] if len(sample_idx) == 0: - return [] # FIXME + return np.array([], dtype=object) format_found = set() for datetime_string in sample_find: # catch warnings from guess_datetime_format @@ -194,22 +195,19 @@ def _guess_datetime_format_for_array( if type(datetime_string) is str: format_found.add(guess_datetime_format(datetime_string, dayfirst=False)) format_found.add(guess_datetime_format(datetime_string, dayfirst=True)) - if None in format_found: - format_found.remove(None) # remove YDM as it does not exist # but is returned by guess_datetime_format - for format in list(format_found): - if re.match(r"%Y[-/_.]+%d[-/_.]+%m", format): + for format_ in list(format_found): + if format_ is None or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_): # doesn't exist but is returned by guess_datetime_format - # FIXME - format_found.remove(format) + format_found.remove(format_) # Try to apply the formats found # to a larger sample format_checked = [] - for format in format_found: - converted = array_strptime(sample_check, fmt=format, errors="coerce")[0] + for format_ in format_found: + converted = array_strptime(sample_check, fmt=format_, errors="coerce")[0] format_checked.append( - (format, int(100 * np.sum(~np.isnan(converted)) / len(converted))) + (format_, int(100 * np.sum(~np.isnan(converted)) / len(converted))) ) # Sort by the number of strings that match the format format_checked.sort(key=lambda x: x[1], reverse=True) @@ -229,7 +227,10 @@ def _guess_datetime_format_for_array( return np.array(format_checked, dtype=object) -def _try_to_repect_dayfirst(formats, dayfirst): +def _try_to_repect_dayfirst( + formats: np.ndarray, + dayfirst: bool | None, +) -> tuple[str, bool | None]: """ If several formats work as well, prefer the format which respect dayfirst. 
@@ -263,7 +264,16 @@ def _try_to_repect_dayfirst(formats, dayfirst): return best_formats[0][0], _check_format_dayfirst(best_formats[0][0]) -def _iterative_conversion(arg, formats, utc, unit, errors, dayfirst, yearfirst, exact): +def _iterative_conversion( + arg: np.ndarray, + formats: np.ndarray, + utc: bool, + unit: str | None, + errors: DateTimeErrorChoices, + dayfirst: bool | None, + yearfirst: bool | None, + exact: bool, +) -> tuple[np.ndarray, np.ndarray]: """ For mixed format, convert datetimestrings iteratively, from the best format (the format which work for most samples) @@ -328,7 +338,6 @@ def _iterative_conversion(arg, formats, utc, unit, errors, dayfirst, yearfirst, elif errors == "coerce": result[~indices_succeeded] = iNaT elif errors == "ignore": - # TODO check result = arg return result, tz_parsed From cbb5e0df0634abdfd79ab792c1f084d4405ce993 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 12 Apr 2023 15:45:46 +0200 Subject: [PATCH 04/37] Cleaning --- pandas/_libs/tslib.pyx | 4 ---- pandas/tests/tools/test_to_datetime.py | 3 --- 2 files changed, 7 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8b790e3bd8adc..9b2ca61e12c29 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -69,8 +69,6 @@ from pandas._libs.tslibs.nattype cimport ( ) from pandas._libs.tslibs.timestamps cimport _Timestamp -import cython - from pandas._libs.tslibs import ( Resolution, get_resolution, @@ -83,8 +81,6 @@ from libc.time cimport time from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single -# Note: this is the only non-tslibs intra-pandas dependency here - def _test_parse_iso8601(ts: str): """ diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 326810177fb3a..f33dc1603d4d5 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3829,6 +3829,3 @@ def 
test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed): # to the worst format result = to_datetime(date_str, format="mixed") tm.assert_index_equal(result, expected_mixed) - - # TODO multiple precision - # (["2011-12-30 00:00:00.000000", "2011-11-20"], "%Y-%m-%d", None) From 81664a21bb9f22afd3d2dfc43bd6894a7b8e8a35 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 12 Apr 2023 15:48:27 +0200 Subject: [PATCH 05/37] Typo --- doc/source/whatsnew/v2.1.0.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index e61a4d7702da7..861ecd6377c24 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -17,7 +17,8 @@ Enhancements .. _whatsnew_210.enhancements.enhancement1: ``pd.to_datetime`` now tries to infer the datetime format of each string by considering -the whole Series, and tries to find the format which work for most strings. If several +a random sample (instead of the first non-null sample), +and tries to find the format which work for most strings. If several formats work as well, the one which matches the ``dayfirst`` parameter is returned. If ``format="mixed"``, pandas does the same thing, then tries the second best format on the strings which failed to parse with the first best format, and so on (:issue:`52508`). @@ -63,8 +64,6 @@ strings which failed to parse with the first best format, and so on (:issue:`525 DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None) -^^^^^^^^^^^^ - .. 
_whatsnew_210.enhancements.enhancement2: ``map(func, na_action="ignore")`` now works for all array types From ef33ba0d0e4987f315a6086a18878e5996de2fbf Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 12 Apr 2023 15:52:00 +0200 Subject: [PATCH 06/37] comment change --- pandas/core/tools/datetimes.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 1a1195a536584..0f0b91b8005ab 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -635,14 +635,13 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) - # get the list of formats which work for some of the elements - # sorted by the percentage of elements that match, highest first - # It's a list of tuples of (format, percentage of elements that match) best_format = None if format is not None and format != "mixed": best_format = format else: - # guess the format + # get a list of formats which work for some of the elements + # sorted by the percentage of elements that match, highest first + # It's a list of tuples of (format, percentage of elements that match) formats = _guess_datetime_format_for_array( arg, n_find_format=20, n_check_format=250 ) From e1652f15b029a47956aa2d36beabc153faf0fcec Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Thu, 13 Apr 2023 13:07:32 +0200 Subject: [PATCH 07/37] simplification --- pandas/core/tools/datetimes.py | 238 ++++++++++++------------- pandas/tests/tools/test_to_datetime.py | 111 ++++++------ 2 files changed, 170 insertions(+), 179 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 0f0b91b8005ab..61ac201fc3662 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -149,11 +149,63 @@ def _check_format_dayfirst(format_string: str) -> bool | None: return dayfirst +def _try_to_repect_dayfirst( + formats: np.ndarray, + dayfirst: bool | None, + warn: bool, +) -> str: + """ + If several 
formats work as well, prefer the format which + respect dayfirst. + + Parameters + ---------- + formats : ndarray + Array of tuples with the format and the percentage of strings that + match the format, sorted by the percentage of strings that match the + format. + dayfirst : bool + Should we prefer dayfirst formats + + Returns + ------- + best_format : str + The format among the best formats which respect dayfirst, + if any, otherwise the first best format. + """ + # Find all formats which work for + # the largest number of samples + best_formats = [ + formats_found for formats_found in formats if formats_found[1] == formats[0][1] + ] + # If several formats work as well, prefer the format which + # respect dayfirst + if len(best_formats) > 1: + for formats_found in best_formats: + if _check_format_dayfirst(formats_found[0]) == dayfirst: + return formats_found[0] + if ( + warn + and _check_format_dayfirst(best_formats[0][0]) is not None + and _check_format_dayfirst(best_formats[0][0]) != dayfirst + ): + warnings.warn( + f"Parsing dates in {best_formats[0][0]} format when " + f"dayfirst={dayfirst} was specified. " + f"Pass `dayfirst={not dayfirst}` or specify a format " + "to silence this warning.", + stacklevel=find_stack_level(), + ) + return best_formats[0][0] + + def _guess_datetime_format_for_array( arr: np.ndarray, - n_find_format: int, - n_check_format: int, -) -> np.ndarray: + dayfirst: bool | None, + n_find_format: int = 10, + n_check_format: int = 200, + warn: bool = True, +) -> str | None: """ Guess the format of the datetime strings in an array. @@ -165,6 +217,8 @@ def _guess_datetime_format_for_array( Number of strings to use to guess the format. n_check_format : int Number of strings to check for each format found. 
+ warn: bool + Whether to warn if we contradict dayfirst Returns ------- @@ -181,8 +235,8 @@ def _guess_datetime_format_for_array( sample_check = arr[sample_idx] sample_find = sample_check[:n_find_format] if len(sample_idx) == 0: - return np.array([], dtype=object) - format_found = set() + return None + formats_found = set() for datetime_string in sample_find: # catch warnings from guess_datetime_format # which appears when dayfirst is contradicted @@ -193,26 +247,26 @@ def _guess_datetime_format_for_array( message="Parsing dates in .* format when dayfirst=.* was specified.", ) if type(datetime_string) is str: - format_found.add(guess_datetime_format(datetime_string, dayfirst=False)) - format_found.add(guess_datetime_format(datetime_string, dayfirst=True)) + formats_found.add( + guess_datetime_format(datetime_string, dayfirst=False) + ) + formats_found.add(guess_datetime_format(datetime_string, dayfirst=True)) # remove YDM as it does not exist # but is returned by guess_datetime_format - for format_ in list(format_found): - if format_ is None or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_): + for format_ in list(formats_found): + if (format_ is None) or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_): # doesn't exist but is returned by guess_datetime_format - format_found.remove(format_) + formats_found.remove(format_) # Try to apply the formats found # to a larger sample - format_checked = [] - for format_ in format_found: + formats_checked = [] + for format_ in formats_found: converted = array_strptime(sample_check, fmt=format_, errors="coerce")[0] - format_checked.append( + formats_checked.append( (format_, int(100 * np.sum(~np.isnan(converted)) / len(converted))) ) - # Sort by the number of strings that match the format - format_checked.sort(key=lambda x: x[1], reverse=True) if ( - len(format_checked) == 0 + len(formats_checked) == 0 and len(sample_check) > 1 and np.any([type(e) is str for e in sample_find]) # GH#32264 np.str_ objects @@ -224,49 +278,18 @@ def 
_guess_datetime_format_for_array( UserWarning, stacklevel=find_stack_level(), ) - return np.array(format_checked, dtype=object) - - -def _try_to_repect_dayfirst( - formats: np.ndarray, - dayfirst: bool | None, -) -> tuple[str, bool | None]: - """ - If several formats work as well, prefer the format which - respect dayfirst. - - Parameters - ---------- - formats : ndarray - Array of tuples with the format and the percentage of strings that - match the format, sorted by the percentage of strings that match the - format. - dayfirst : bool - Should we prefer dayfirst formats - - Returns - ------- - best_format : str - The format among the best formats which respect dayfirst, - if any, otherwise the first best format. - """ - # Find all formats which work for - # the largest number of samples - best_formats = [ - format_found for format_found in formats if format_found[1] == formats[0][1] - ] - # If several formats work as well, prefer the format which - # respect dayfirst - if len(best_formats) > 1: - for format_found in best_formats: - if _check_format_dayfirst(format_found[0]) == dayfirst: - return format_found[0], _check_format_dayfirst(format_found[0]) - return best_formats[0][0], _check_format_dayfirst(best_formats[0][0]) + if not len(formats_checked): + return None + else: + # Sort by the number of strings that match the format + formats_checked.sort(key=lambda x: x[1], reverse=True) + best_format = _try_to_repect_dayfirst(formats_checked, dayfirst, warn) + return best_format def _iterative_conversion( arg: np.ndarray, - formats: np.ndarray, + name: str, utc: bool, unit: str | None, errors: DateTimeErrorChoices, @@ -283,10 +306,8 @@ def _iterative_conversion( ---------- arg : ndarray Array of datetime strings. - formats : ndarray - Array of tuples with the format and the percentage of strings that - match the format, sorted by the percentage of strings that match the - format. + name : str + Name of the argument. 
utc : bool Whether to convert/localize timestamps to UTC. unit : str @@ -303,16 +324,18 @@ def _iterative_conversion( """ # iteratively convert the remaining samples # in "coerce" mode with the ith best format - # until all values are converted or all formats are exhausted # or 10 formats have been tried - best_format = _try_to_repect_dayfirst(formats, dayfirst)[0] - # remove the best format from the list - formats = formats[formats[:, 0] != best_format] + # if we contradict dayfirst, we warn for the first format, but not the rest + best_format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst, warn=True) result, tz_parsed = array_strptime(arg, best_format, exact, "coerce", utc) indices_succeeded = notna(result) - for _ in range(min(len(formats), 10)): - best_format = _try_to_repect_dayfirst(formats, dayfirst)[0] - formats = formats[formats[:, 0] != best_format] + for _ in range(10): + best_format = _guess_datetime_format_for_array( + arg[~indices_succeeded], dayfirst=dayfirst, warn=False + ) + + if best_format is None: + break results_format, timezones_format = array_strptime( arg[~indices_succeeded], best_format, exact, "coerce", utc ) @@ -339,7 +362,11 @@ def _iterative_conversion( result[~indices_succeeded] = iNaT elif errors == "ignore": result = arg - return result, tz_parsed + + if any(tz is not None for tz in tz_parsed): + return _return_parsed_timezone_results(result, tz_parsed, utc, name) + + return _box_as_indexlike(result, utc=utc, name=name) def should_cache( @@ -635,64 +662,33 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) - best_format = None + if format is None: + format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) + + # `format` could be inferred, or user didn't ask for mixed-format parsing. 
if format is not None and format != "mixed": - best_format = format - else: - # get a list of formats which work for some of the elements - # sorted by the percentage of elements that match, highest first - # It's a list of tuples of (format, percentage of elements that match) - formats = _guess_datetime_format_for_array( - arg, n_find_format=20, n_check_format=250 - ) - if len(formats) == 0: - result, tz_parsed = objects_to_datetime64ns( - arg, - dayfirst=dayfirst, - yearfirst=yearfirst, - utc=utc, - errors=errors, - allow_object=True, - ) - if tz_parsed is not None: - # We can take a shortcut since the datetime64 numpy array - # is in UTC - dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) - return DatetimeIndex._simple_new(dta, name=name) - if format != "mixed" and len(formats) > 0: - # formats[0][1] is the percentage of elements that matched - if errors == "raise" and formats[0][1] != 100: - raise ValueError( - "No datetime format was found which " - "matched all values in the array.\n" - "You might want to try:\n" - " - passing `format` if your strings have a consistent format;\n" - " - passing `format='ISO8601'` if your strings are " - "all ISO8601 but not necessarily in exactly the same format;\n" - " - passing `format='mixed'`, and the format will be " - "inferred for each element individually. " - "You might want to use `dayfirst` alongside this.\n" - f"Best format found: {formats[0][0]} " - "(matched {formats[0][1]}% of the values)" - ) - best_format, best_format_dayfirst = _try_to_repect_dayfirst( - formats, dayfirst - ) - if best_format_dayfirst is not None and best_format_dayfirst != dayfirst: - warnings.warn( - f"Parsing dates in {best_format} format when " - f"dayfirst={dayfirst} was specified. 
" - f"Pass `dayfirst={not dayfirst}` or specify a format " - "to silence this warning.", - stacklevel=find_stack_level(), - ) - if best_format is not None: - return _array_strptime_with_fallback(arg, name, utc, best_format, exact, errors) + return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) + if format == "mixed": - result, tz_parsed = _iterative_conversion( - arg, formats, utc, unit, errors, dayfirst, yearfirst, exact + return _iterative_conversion( + arg, name, utc, unit, errors, dayfirst, yearfirst, exact ) + result, tz_parsed = objects_to_datetime64ns( + arg, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + errors=errors, + allow_object=True, + ) + + if tz_parsed is not None: + # We can take a shortcut since the datetime64 numpy array + # is in UTC + dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) + return DatetimeIndex._simple_new(dta, name=name) + return _box_as_indexlike(result, utc=utc, name=name) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f33dc1603d4d5..ed967c9a128b2 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2684,7 +2684,7 @@ def test_to_datetime_dta_tz(self, klass): class TestGuessDatetimeFormat: @pytest.mark.parametrize( - "test_list, expected_formats", + "test_list, expected_format", [ ( [ @@ -2692,69 +2692,37 @@ class TestGuessDatetimeFormat: "2011-12-30 00:00:00.000000", "2011-12-30 00:00:00.000000", ], - np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), - ), - ( - [np.nan, np.nan, "2011-12-30 00:00:00.000000"], - np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), - ), - ( - ["", "2011-12-30 00:00:00.000000"], - np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), - ), - ( - ["NaT", "2011-12-30 00:00:00.000000"], - np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), - ), - ( - ["2011-12-30 00:00:00.000000", "random_string"], - np.array([("%Y-%m-%d %H:%M:%S.%f", 50)], 
dtype=object), - ), - ( - ["now", "2011-12-30 00:00:00.000000"], - np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), - ), - ( - ["today", "2011-12-30 00:00:00.000000"], - np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + "%Y-%m-%d %H:%M:%S.%f", ), + ([np.nan, np.nan, "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + (["", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + (["NaT", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + (["2011-12-30 00:00:00.000000", "random_string"], "%Y-%m-%d %H:%M:%S.%f"), + (["now", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + (["today", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), ( ["30-12-2012 00:00:00.000000", "12-30-2012 00:00:00.000000"], - np.array( - [("%d-%m-%Y %H:%M:%S.%f", 50), ("%m-%d-%Y %H:%M:%S.%f", 50)], - dtype=object, - ), - ), - ( - ["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"], - np.array( - [("%d-%m-%Y", 33), ("%m/%d/%Y", 33), ("%Y-%m-%d %H:%M:%S.%f", 33)], - dtype=object, - ), + "%m-%d-%Y %H:%M:%S.%f", ), + (["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"], "%m/%d/%Y"), ], ) - def test_guess_datetime_format_for_array(self, test_list, expected_formats): + def test_guess_datetime_format_for_array(self, test_list, expected_format): test_array = np.array(test_list, dtype=object) res = tools._guess_datetime_format_for_array( - test_array, n_find_format=5, n_check_format=5 + test_array, dayfirst=False, n_find_format=5, n_check_format=5 ) - # sort according to first element of tuple (format string) to ignore order - sorted_index = np.argsort([x[0] for x in res]) - res = res[sorted_index] - sorted_index = np.argsort([x[0] for x in expected_formats]) - expected_formats = expected_formats[sorted_index] - assert (res == expected_formats).all() - # TODO more tests + assert res == expected_format @td.skip_if_not_us_locale def test_guess_datetime_format_for_array_all_nans(self): format_for_string_of_nans = 
tools._guess_datetime_format_for_array( np.array([np.nan, np.nan, np.nan], dtype="O"), + dayfirst=False, n_find_format=5, n_check_format=5, ) - assert len(format_for_string_of_nans) == 0 + assert format_for_string_of_nans is None class TestToDatetimeInferFormat: @@ -3658,13 +3626,20 @@ def test_multiple_dates_non_ambiguous(self, date_str, expected_format, dayfirst) result = to_datetime( date_str, errors=errors, dayfirst=try_dayfirst ) + # should also work for format="mixed" + result_mixed = to_datetime( + date_str, + errors=errors, + dayfirst=try_dayfirst, + format="mixed", + ) else: result = to_datetime(date_str, errors=errors, dayfirst=try_dayfirst) + result_mixed = to_datetime( + date_str, errors=errors, dayfirst=try_dayfirst, format="mixed" + ) tm.assert_index_equal(result, expected) - - # should also work with format="mixed" - result = to_datetime(date_str, format="mixed") - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result_mixed, expected) # ambiguous dates @pytest.mark.parametrize( @@ -3719,12 +3694,16 @@ def test_multiple_dates_ambiguous_error(self, date_str): # should raise an error with "raise" with pytest.raises( ValueError, - match="No datetime format was found which matched all values in the array", + match="""time data "random_string" doesn't match format "%d-%m-%Y", """ + "at position 2. " + f"{PARSING_ERR_MSG}", ): to_datetime(date_str, errors="raise", dayfirst=True) with pytest.raises( ValueError, - match="No datetime format was found which matched all values in the array", + match="""time data "random_string" doesn't match format "%m-%d-%Y", """ + "at position 2. 
" + f"{PARSING_ERR_MSG}", ): to_datetime(date_str, errors="raise", dayfirst=False) @@ -3807,12 +3786,18 @@ def test_multiple_dates_ambiguous_error(self, date_str): def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed): # no format works for all dates # raise should raise an error + msg = r'^time data ".*" doesn\'t match format ".*", at position .*' with pytest.raises( ValueError, - match="No datetime format was found which matched all values in the array", + match=msg, ): - to_datetime(date_str, errors="raise") - + if expected_formats[0] == "%d-%m-%Y": + # contradicting default dayfirst=False + with tm.assert_produces_warning(UserWarning): + # FIXME: do we need to raise a warning here? + to_datetime(date_str, errors="raise") + else: + to_datetime(date_str, errors="raise") # coerce and ignore should choose the format # which works for the most dates (the first one) for errors in ["coerce", "ignore"]: @@ -3827,5 +3812,15 @@ def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed): # if format="mixed", the conversion should be done from the best format # to the worst format - result = to_datetime(date_str, format="mixed") - tm.assert_index_equal(result, expected_mixed) + for errors in ["raise", "coerce", "ignore"]: + if expected_formats[0] == "%d-%m-%Y": + # we raise a warning if the best format used + # (the one which works for the most dates) + # contradict the default dayfirst=False + with tm.assert_produces_warning(UserWarning): + result = to_datetime(date_str, format="mixed", errors=errors) + else: + # we don't raise a warning if other formats used + # contradict dayfirst + result = to_datetime(date_str, format="mixed", errors=errors) + tm.assert_index_equal(result, expected_mixed) From 6b371ca6318a0a6d473e648560d72a5b22f96e56 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Thu, 13 Apr 2023 13:48:55 +0200 Subject: [PATCH 08/37] remove randomness --- pandas/_libs/tslib.pyx | 10 ++++++---- pandas/core/tools/datetimes.py 
| 14 ++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 9b2ca61e12c29..fb96c9d6115c7 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -401,8 +401,8 @@ def first_non_null(values: ndarray) -> int: @cython.wraparound(False) @cython.boundscheck(False) -def random_non_null(values: ndarray, int n) -> ndarray: - """Find n non-null values selected at random, return an array of indices.""" +def evenly_spaced_non_null(values: ndarray, int n) -> ndarray: + """Find n evenly spaced non-null values, return an array of indices.""" cdef: Py_ssize_t total = len(values) Py_ssize_t i, non_null_count @@ -422,8 +422,10 @@ def random_non_null(values: ndarray, int n) -> ndarray: non_null_count = len(non_null_indices) if non_null_count == 0 or n <= 0: return np.empty(0, dtype=np.int64) - # use np.random.choice - return np.random.choice(non_null_indices, min(n, non_null_count), replace=False) + evenly_spaced_indices = np.linspace(0, len(non_null_indices) - 1, + min(len(non_null_indices), n), + dtype=int) + return np.array(non_null_indices)[evenly_spaced_indices] @cython.wraparound(False) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 61ac201fc3662..05046e9850f98 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -227,15 +227,13 @@ def _guess_datetime_format_for_array( match the format, sorted by the percentage of strings that match the format. 
""" - # Extract a random sample of datetime strings - assert ( - n_find_format <= n_check_format - ), "n_check_format must be greater than n_find_format" - sample_idx = tslib.random_non_null(arr, n_check_format) - sample_check = arr[sample_idx] - sample_find = sample_check[:n_find_format] - if len(sample_idx) == 0: + # Extract a sample of datetime strings + idx_find = tslib.evenly_spaced_non_null(arr, n_find_format) + if len(idx_find) == 0: return None + idx_check = tslib.evenly_spaced_non_null(arr, n_check_format) + sample_check = arr[idx_check] + sample_find = arr[idx_find] formats_found = set() for datetime_string in sample_find: # catch warnings from guess_datetime_format From 705d1b4daca4a1469d32e14a2ac94da54a3e6ef9 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 10:08:45 +0200 Subject: [PATCH 09/37] fix parser tests --- pandas/core/tools/datetimes.py | 5 +++-- pandas/tests/io/parser/test_parse_dates.py | 17 +++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 05046e9850f98..105d8c4582d05 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -189,9 +189,10 @@ def _try_to_repect_dayfirst( and _check_format_dayfirst(best_formats[0][0]) is not None and _check_format_dayfirst(best_formats[0][0]) != dayfirst ): + default_string = " (the default)" if not dayfirst else "" warnings.warn( f"Parsing dates in {best_formats[0][0]} format when " - f"dayfirst={dayfirst} was specified. " + f"dayfirst={dayfirst}{default_string} was specified. 
" f"Pass `dayfirst={not dayfirst}` or specify a format " "to silence this warning.", stacklevel=find_stack_level(), @@ -252,7 +253,7 @@ def _guess_datetime_format_for_array( # remove YDM as it does not exist # but is returned by guess_datetime_format for format_ in list(formats_found): - if (format_ is None) or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_): + if (format_ is None) or re.match(r".*%Y.*%d.*%m.*", format_): # doesn't exist but is returned by guess_datetime_format formats_found.remove(format_) # Try to apply the formats found diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 8c3474220cde8..826bee7ba4dbd 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1806,11 +1806,13 @@ def test_parse_delimited_date_swap_with_warning( def test_parse_multiple_delimited_dates_with_swap_warnings(): # GH46210 - with pytest.raises( - ValueError, + with tm.assert_produces_warning( + UserWarning, match=( - r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", ' - r"at position 1. You might want to try:" + "Parsing dates in %d/%m/%Y format when " + "dayfirst=False \\(the default\\) was specified. " + "Pass `dayfirst=True` or specify a format " + "to silence this warning." ), ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) @@ -2020,10 +2022,9 @@ def test_dayfirst_warnings(): tm.assert_index_equal(expected, res5) # B. 
use dayfirst=False - with tm.assert_produces_warning(UserWarning, match=warning_msg): - res6 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" - ).index + res6 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" + ).index tm.assert_index_equal(expected, res6) From 0bae15d44f31cb1c0b30c836d0511d30b6bfb5ba Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 11:18:41 +0200 Subject: [PATCH 10/37] simplify getting evenly spaced non null --- pandas/_libs/tslib.pyi | 1 - pandas/_libs/tslib.pyx | 32 +------------------------------- pandas/core/tools/datetimes.py | 17 ++++++++++++----- 3 files changed, 13 insertions(+), 37 deletions(-) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index bd8748cd2650a..9819b5173db56 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -17,7 +17,6 @@ def array_with_unit_to_datetime( errors: str = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... def first_non_null(values: np.ndarray) -> int: ... -def random_non_null(values: np.ndarray, n: int) -> np.ndarray: ... 
def array_to_datetime( values: npt.NDArray[np.object_], errors: str = ..., diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index fb96c9d6115c7..106f203a16855 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -75,8 +75,7 @@ from pandas._libs.tslibs import ( ) from pandas._libs.tslibs.timestamps import Timestamp -from libc.stdlib cimport srand -from libc.time cimport time +# Note: this is the only non-tslibs intra-pandas dependency here from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single @@ -399,35 +398,6 @@ def first_non_null(values: ndarray) -> int: return -1 -@cython.wraparound(False) -@cython.boundscheck(False) -def evenly_spaced_non_null(values: ndarray, int n) -> ndarray: - """Find n evenly spaced non-null values, return an array of indices.""" - cdef: - Py_ssize_t total = len(values) - Py_ssize_t i, non_null_count - list non_null_indices = [] - srand(time(NULL)) - for i in range(total): - val = values[i] - if checknull_with_nat_and_na(val): - continue - if ( - isinstance(val, str) - and - (len(val) == 0 or val in nat_strings or val in ("now", "today")) - ): - continue - non_null_indices.append(i) - non_null_count = len(non_null_indices) - if non_null_count == 0 or n <= 0: - return np.empty(0, dtype=np.int64) - evenly_spaced_indices = np.linspace(0, len(non_null_indices) - 1, - min(len(non_null_indices), n), - dtype=int) - return np.array(non_null_indices)[evenly_spaced_indices] - - @cython.wraparound(False) @cython.boundscheck(False) cpdef array_to_datetime( diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 105d8c4582d05..32a1cbaab5a9d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -229,12 +229,19 @@ def _guess_datetime_format_for_array( format. 
""" # Extract a sample of datetime strings - idx_find = tslib.evenly_spaced_non_null(arr, n_find_format) - if len(idx_find) == 0: + # ignore missing + arr_non_null = arr[notna(arr)] + arr_non_null = arr_non_null[ + ~np.isin(arr_non_null, ["", "now", "today"] + list(nat_strings)) + ] + if len(arr_non_null) == 0: return None - idx_check = tslib.evenly_spaced_non_null(arr, n_check_format) - sample_check = arr[idx_check] - sample_find = arr[idx_find] + # get evenly spaced non-null indices + step_find = max(len(arr_non_null) // n_find_format, 1) + step_check = max(len(arr_non_null) // n_check_format, 1) + sample_check = arr_non_null[np.arange(0, len(arr_non_null), step_check)] + sample_find = arr_non_null[np.arange(0, len(arr_non_null), step_find)] + # try formats formats_found = set() for datetime_string in sample_find: # catch warnings from guess_datetime_format From de7331f5dcbafba582d02855a0a36796759d1265 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 11:32:02 +0200 Subject: [PATCH 11/37] update io readme --- doc/source/user_guide/io.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index c33d4ab92d4c6..7522fecfed3fb 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -977,11 +977,10 @@ Note that format inference is sensitive to ``dayfirst``. With ``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With ``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th. -If you try to parse a column of date strings, pandas will attempt to guess the format -from the first non-NaN element, and will then parse the rest of the column with that -format. If pandas fails to guess the format (for example if your first string is -``'01 December US/Pacific 2000'``), then a warning will be raised and each -row will be parsed individually by ``dateutil.parser.parse``. 
The safest +If you try to parse a column of date strings, pandas will attempt to find the format +which work best from a sample of non-NaN elements, and will then parse the rest of the +column with that format. If pandas fails to guess the format, then a warning will be +raised and each row will be parsed individually by ``dateutil.parser.parse``. The safest way to parse dates is to explicitly set ``format=``. .. ipython:: python @@ -994,7 +993,9 @@ way to parse dates is to explicitly set ``format=``. df In the case that you have mixed datetime formats within the same column, you can -pass ``format='mixed'`` +pass ``format='mixed'``. Pandas will convert rows to the best format found (the one +which matches the most rows), and then iteratively convert the remaining rows with the +remaining formats. .. ipython:: python From 9136b4f57fd50d2cc3e5fd0d3f7844cfff2aed97 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 11:41:33 +0200 Subject: [PATCH 12/37] revert changed tests --- pandas/tests/tools/test_to_datetime.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index ed967c9a128b2..016e30e6dfda9 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1313,7 +1313,10 @@ def test_datetime_bool_arrays_mixed(self, cache): to_datetime([False, datetime.today()], cache=cache) with pytest.raises( ValueError, - match=(f"{PARSING_ERR_MSG}"), + match=( + r'^time data "True" doesn\'t match format "%Y%m%d", ' + f"at position 1. 
{PARSING_ERR_MSG}$" + ), ): to_datetime(["20130101", True], cache=cache) tm.assert_index_equal( @@ -2401,7 +2404,10 @@ def test_to_datetime_on_datetime64_series(self, cache): def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) - msg = rf"{PARSING_ERR_MSG}" + msg = ( + r'^time data " " doesn\'t match format "%m/%d/%Y", ' + rf"at position 2. {PARSING_ERR_MSG}$" + ) with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) @@ -2746,7 +2752,10 @@ def test_to_datetime_infer_datetime_format_consistent_format( def test_to_datetime_inconsistent_format(self, cache): data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] ser = Series(np.array(data)) - msg = f"{PARSING_ERR_MSG}" + msg = ( + r'^time data "01-02-2011 00:00:00" doesn\'t match format ' + rf'"%m/%d/%Y %H:%M:%S", at position 1. {PARSING_ERR_MSG}$' + ) with pytest.raises(ValueError, match=msg): to_datetime(ser, cache=cache) From 9f966d56d22d5f890d40c6dfecd201f3cef56e90 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 12:51:40 +0200 Subject: [PATCH 13/37] fix type hints --- doc/source/whatsnew/v0.19.0.rst | 2 +- pandas/core/tools/datetimes.py | 26 ++++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index ab17cacd830e5..c300fc7f286db 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -765,7 +765,7 @@ Previously if ``.to_datetime()`` encountered mixed integers/floats and strings, This will now convert integers/floats with the default unit of ``ns``. .. 
ipython:: python - + :okwarning: pd.to_datetime([1, "foo"], errors="coerce") Bug fixes related to ``.to_datetime()``: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 32a1cbaab5a9d..d33af6482d492 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -150,7 +150,7 @@ def _check_format_dayfirst(format_string: str) -> bool | None: def _try_to_repect_dayfirst( - formats: np.ndarray, + formats: list, dayfirst: bool | None, warn: bool, ) -> str: @@ -242,7 +242,7 @@ def _guess_datetime_format_for_array( sample_check = arr_non_null[np.arange(0, len(arr_non_null), step_check)] sample_find = arr_non_null[np.arange(0, len(arr_non_null), step_find)] # try formats - formats_found = set() + formats_found = [] for datetime_string in sample_find: # catch warnings from guess_datetime_format # which appears when dayfirst is contradicted @@ -253,14 +253,17 @@ def _guess_datetime_format_for_array( message="Parsing dates in .* format when dayfirst=.* was specified.", ) if type(datetime_string) is str: - formats_found.add( + formats_found.append( guess_datetime_format(datetime_string, dayfirst=False) ) - formats_found.add(guess_datetime_format(datetime_string, dayfirst=True)) + formats_found.append( + guess_datetime_format(datetime_string, dayfirst=True) + ) + formats_found = [format_ for format_ in formats_found if format_ is not None] # remove YDM as it does not exist # but is returned by guess_datetime_format - for format_ in list(formats_found): - if (format_ is None) or re.match(r".*%Y.*%d.*%m.*", format_): + for format_ in np.unique(formats_found): + if re.match(r".*%Y.*%d.*%m.*", format_): # doesn't exist but is returned by guess_datetime_format formats_found.remove(format_) # Try to apply the formats found @@ -284,25 +287,27 @@ def _guess_datetime_format_for_array( UserWarning, stacklevel=find_stack_level(), ) + print(formats_checked) if not len(formats_checked): return None else: # Sort by the number of strings 
that match the format formats_checked.sort(key=lambda x: x[1], reverse=True) best_format = _try_to_repect_dayfirst(formats_checked, dayfirst, warn) + print(best_format) return best_format def _iterative_conversion( arg: np.ndarray, - name: str, + name: Hashable, utc: bool, unit: str | None, errors: DateTimeErrorChoices, dayfirst: bool | None, yearfirst: bool | None, exact: bool, -) -> tuple[np.ndarray, np.ndarray]: +) -> Index: """ For mixed format, convert datetimestrings iteratively, from the best format (the format which work for most samples) @@ -313,7 +318,7 @@ def _iterative_conversion( arg : ndarray Array of datetime strings. name : str - Name of the argument. + None or string for the Index name utc : bool Whether to convert/localize timestamps to UTC. unit : str @@ -538,7 +543,7 @@ def _convert_and_box_cache( def _return_parsed_timezone_results( - result: np.ndarray, timezones, utc: bool, name: str + result: np.ndarray, timezones, utc: bool, name: Hashable ) -> Index: """ Return results from array_strptime if a %z or %Z directive was passed. @@ -996,6 +1001,7 @@ def to_datetime( - "mixed", to allow for multiple formats. Values will be parsed iteratively using the most promising format at each step. This is risky, and you should probably use it along with `dayfirst`. 
+ exact : bool, default True Control how `format` is used: From 7ca7244ce9036d3f61177be1c661d68d022708c9 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 14:42:50 +0200 Subject: [PATCH 14/37] fix type hints for np.unique --- pandas/core/tools/datetimes.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index d33af6482d492..ad9419f4bcd6d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -253,13 +253,12 @@ def _guess_datetime_format_for_array( message="Parsing dates in .* format when dayfirst=.* was specified.", ) if type(datetime_string) is str: - formats_found.append( - guess_datetime_format(datetime_string, dayfirst=False) - ) - formats_found.append( - guess_datetime_format(datetime_string, dayfirst=True) - ) - formats_found = [format_ for format_ in formats_found if format_ is not None] + for try_dayfirst in [False, True]: + format_found = guess_datetime_format( + datetime_string, dayfirst=try_dayfirst + ) + if format_found is not None: + formats_found.append(format_found) # remove YDM as it does not exist # but is returned by guess_datetime_format for format_ in np.unique(formats_found): From 4b81192e258a4f97b1953b25ed50b971b2425114 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 15:24:32 +0200 Subject: [PATCH 15/37] remove prints --- pandas/core/tools/datetimes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ad9419f4bcd6d..cd58abace1f72 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -286,14 +286,12 @@ def _guess_datetime_format_for_array( UserWarning, stacklevel=find_stack_level(), ) - print(formats_checked) if not len(formats_checked): return None else: # Sort by the number of strings that match the format formats_checked.sort(key=lambda x: x[1], reverse=True) best_format = 
_try_to_repect_dayfirst(formats_checked, dayfirst, warn) - print(best_format) return best_format From 001a270f22aae9383a2c931d0be6c0180d2b1154 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 17:12:59 +0200 Subject: [PATCH 16/37] fix doc --- doc/source/whatsnew/v0.19.0.rst | 1 + pandas/core/tools/datetimes.py | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index c300fc7f286db..bd8b5baa5b701 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -766,6 +766,7 @@ This will now convert integers/floats with the default unit of ``ns``. .. ipython:: python :okwarning: + pd.to_datetime([1, "foo"], errors="coerce") Bug fixes related to ``.to_datetime()``: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index cd58abace1f72..92ebed261b529 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -214,16 +214,18 @@ def _guess_datetime_format_for_array( ---------- arr : ndarray Array of datetime strings. + dayfirst : bool + dayfirst parsing behavior from to_datetime. n_find_format : int Number of strings to use to guess the format. n_check_format : int Number of strings to check for each format found. - warn: bool - Whether to warn if we contradict dayfirst + warn : bool + Whether to warn if we contradict dayfirst. Returns ------- - formats : ndarray + ndarray Array of tuples with the format and the percentage of strings that match the format, sorted by the percentage of strings that match the format. @@ -1182,8 +1184,8 @@ def to_datetime( If multiple datetime formats are possible for a value, pandas will try to infer the most plausible format using the other examples. 
- >>> pd.to_datetime(["01-02-2012", "30-01-2012"]) - DatetimeIndex(['2012-02-01', '2012-01-30'], dtype='datetime64[ns]', freq=None) + >>> pd.to_datetime(["01-02-2012", "02-30-2012"]) + DatetimeIndex(['2012-01-02', '2012-02-30'], dtype='datetime64[ns]', freq=None) .. _to_datetime_tz_examples: From fe99f83857ff1d102f6281670881cf9ec33dab07 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 17:58:08 +0200 Subject: [PATCH 17/37] fix example with February 30th --- pandas/core/tools/datetimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 92ebed261b529..ac2846d826bc7 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1184,8 +1184,8 @@ def to_datetime( If multiple datetime formats are possible for a value, pandas will try to infer the most plausible format using the other examples. - >>> pd.to_datetime(["01-02-2012", "02-30-2012"]) - DatetimeIndex(['2012-01-02', '2012-02-30'], dtype='datetime64[ns]', freq=None) + >>> pd.to_datetime(["01-02-2012", "02-27-2012"]) + DatetimeIndex(['2012-01-02', '2012-02-27'], dtype='datetime64[ns]', freq=None) .. _to_datetime_tz_examples: From 8de90e4b1d02d0a2fcd13bcd69f4a8d31620f523 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 20:21:28 +0200 Subject: [PATCH 18/37] fix doc --- pandas/core/tools/datetimes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ac2846d826bc7..795fee0604cf0 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -997,9 +997,10 @@ def to_datetime( - "ISO8601", to parse any `ISO8601 `_ time string (not necessarily in exactly the same format); + - "mixed", to allow for multiple formats. Values will be parsed iteratively - using the most promising format at each step. This is risky, - and you should probably use it along with `dayfirst`. 
+ using the most promising format at each step. This is risky, + and you should probably use it along with `dayfirst`. exact : bool, default True Control how `format` is used: From 281d45bc355dda8e5fbfe3c4d46563c745d519cc Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 12 Apr 2023 01:52:18 +0200 Subject: [PATCH 19/37] All tests pass --- pandas/_libs/tslib.pyx | 34 ++- pandas/core/tools/datetimes.py | 311 +++++++++++++++++++++---- pandas/tests/tools/test_to_datetime.py | 309 +++++++++++++++++++++--- 3 files changed, 575 insertions(+), 79 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 106f203a16855..8b790e3bd8adc 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -69,17 +69,22 @@ from pandas._libs.tslibs.nattype cimport ( ) from pandas._libs.tslibs.timestamps cimport _Timestamp +import cython + from pandas._libs.tslibs import ( Resolution, get_resolution, ) from pandas._libs.tslibs.timestamps import Timestamp -# Note: this is the only non-tslibs intra-pandas dependency here +from libc.stdlib cimport srand +from libc.time cimport time from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single +# Note: this is the only non-tslibs intra-pandas dependency here + def _test_parse_iso8601(ts: str): """ @@ -398,6 +403,33 @@ def first_non_null(values: ndarray) -> int: return -1 +@cython.wraparound(False) +@cython.boundscheck(False) +def random_non_null(values: ndarray, int n) -> ndarray: + """Find n non-null values selected at random, return an array of indices.""" + cdef: + Py_ssize_t total = len(values) + Py_ssize_t i, non_null_count + list non_null_indices = [] + srand(time(NULL)) + for i in range(total): + val = values[i] + if checknull_with_nat_and_na(val): + continue + if ( + isinstance(val, str) + and + (len(val) == 0 or val in nat_strings or val in ("now", "today")) + ): + continue + non_null_indices.append(i) + non_null_count = 
len(non_null_indices) + if non_null_count == 0 or n <= 0: + return np.empty(0, dtype=np.int64) + # use np.random.choice + return np.random.choice(non_null_indices, min(n, non_null_count), replace=False) + + @cython.wraparound(False) @cython.boundscheck(False) cpdef array_to_datetime( diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 3e1b6070ffc39..0d7e3f4999787 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -4,6 +4,7 @@ from datetime import datetime from functools import partial from itertools import islice +import re from typing import ( TYPE_CHECKING, Callable, @@ -128,27 +129,207 @@ class FulldatetimeDict(YearMonthDayDict, total=False): # --------------------------------------------------------------------- -def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: - # Try to guess the format based on the first non-NaN element, return None if can't - if (first_non_null := tslib.first_non_null(arr)) != -1: - if type(first_non_nan_element := arr[first_non_null]) is str: - # GH#32264 np.str_ object - guessed_format = guess_datetime_format( - first_non_nan_element, dayfirst=dayfirst +def _check_format_dayfirst(format_string): + dayfirst = False + for char in ["%d", "%m", "%Y"]: + if char not in format_string: + return None + + if format_string.index("%d") < format_string.index("%m") and format_string.index( + "%m" + ) < format_string.index("%Y"): + dayfirst = True + elif format_string.index("%m") < format_string.index("%d") and format_string.index( + "%d" + ) < format_string.index("%Y"): + dayfirst = False + else: + dayfirst = None + + return dayfirst + + +def _guess_datetime_format_for_array( + arr, n_find_format, n_check_format +) -> ArrayLike[tuple[str, str]]: + """ + Guess the format of the datetime strings in an array. + + Parameters + ---------- + arr : ndarray + Array of datetime strings. + n_find_format : int + Number of strings to use to guess the format. 
+ n_check_format : int + Number of strings to check for each format found. + + Returns + ------- + formats : ndarray + Array of tuples with the format and the percentage of strings that + match the format, sorted by the percentage of strings that match the + format. + """ + # Extract a random sample of datetime strings + assert ( + n_find_format <= n_check_format + ), "n_check_format must be greater than n_find_format" + sample_idx = tslib.random_non_null(arr, n_check_format) + sample_check = arr[sample_idx] + sample_find = sample_check[:n_find_format] + if len(sample_idx) == 0: + return [] # FIXME + format_found = set() + for datetime_string in sample_find: + # catch warnings from guess_datetime_format + # which appears when dayfirst is contradicted + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=UserWarning, + message="Parsing dates in .* format when dayfirst=.* was specified.", ) - if guessed_format is not None: - return guessed_format - # If there are multiple non-null elements, warn about - # how parsing might not be consistent - if tslib.first_non_null(arr[first_non_null + 1 :]) != -1: - warnings.warn( - "Could not infer format, so each element will be parsed " - "individually, falling back to `dateutil`. 
To ensure parsing is " - "consistent and as-expected, please specify a format.", - UserWarning, - stacklevel=find_stack_level(), - ) - return None + if type(datetime_string) is str: + format_found.add(guess_datetime_format(datetime_string, dayfirst=False)) + format_found.add(guess_datetime_format(datetime_string, dayfirst=True)) + if None in format_found: + format_found.remove(None) + # remove YDM as it does not exist + # but is returned by guess_datetime_format + for format in list(format_found): + if re.match(r"%Y[-/_.]+%d[-/_.]+%m", format): + # doesn't exist but is returned by guess_datetime_format + # FIXME + format_found.remove(format) + # Try to apply the formats found + # to a larger sample + format_checked = [] + for format in format_found: + converted = array_strptime(sample_check, fmt=format, errors="coerce")[0] + format_checked.append( + (format, int(100 * np.sum(~np.isnan(converted)) / len(converted))) + ) + # Sort by the number of strings that match the format + format_checked.sort(key=lambda x: x[1], reverse=True) + if ( + len(format_checked) == 0 + and len(sample_check) > 1 + and np.any([type(e) is str for e in sample_find]) + # GH#32264 np.str_ objects + ): + warnings.warn( + "Could not infer format, so each element will be parsed " + "individually, falling back to `dateutil`. To ensure parsing is " + "consistent and as-expected, please specify a format.", + UserWarning, + stacklevel=find_stack_level(), + ) + return np.array(format_checked, dtype=object) + + +def _try_to_repect_dayfirst(formats, dayfirst): + """ + If several formats work as well, prefer the format which + respect dayfirst. + + Parameters + ---------- + formats : ndarray + Array of tuples with the format and the percentage of strings that + match the format, sorted by the percentage of strings that match the + format. 
+ dayfirst : bool + Should we prefer dayfirst formats + + Returns + ------- + best_format : str + The format among the best formats which respect dayfirst, + if any, otherwise the first best format. + """ + # Find all formats which work for + # the largest number of samples + best_formats = [ + format_found for format_found in formats if format_found[1] == formats[0][1] + ] + # If several formats work as well, prefer the format which + # respect dayfirst + if len(best_formats) > 1: + for format_found in best_formats: + if _check_format_dayfirst(format_found[0]) == dayfirst: + return format_found[0], _check_format_dayfirst(format_found[0]) + return best_formats[0][0], _check_format_dayfirst(best_formats[0][0]) + + +def _iterative_conversion(arg, formats, utc, unit, errors, dayfirst, yearfirst, exact): + """ + For mixed format, convert datetimestrings iteratively, + from the best format (the format which work for most samples) + to the worst. + + Parameters + ---------- + arg : ndarray + Array of datetime strings. + formats : ndarray + Array of tuples with the format and the percentage of strings that + match the format, sorted by the percentage of strings that match the + format. + utc : bool + Whether to convert/localize timestamps to UTC. 
+ unit : str + None or string of the frequency of the passed data + errors : str + error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore' + dayfirst : bool + dayfirst parsing behavior from to_datetime + yearfirst : bool + yearfirst parsing behavior from to_datetime + exact : bool, default True + exact format matching behavior from to_datetime + + """ + # iteratively convert the remaining samples + # in "coerce" mode with the ith best format + # until all values are converted or all formats are exhausted + # or 10 formats have been tried + best_format = _try_to_repect_dayfirst(formats, dayfirst)[0] + # remove the best format from the list + formats = formats[formats[:, 0] != best_format] + result, tz_parsed = array_strptime(arg, best_format, exact, "coerce", utc) + indices_succeeded = notna(result) + for _ in range(min(len(formats), 10)): + best_format = _try_to_repect_dayfirst(formats, dayfirst)[0] + formats = formats[formats[:, 0] != best_format] + results_format, timezones_format = array_strptime( + arg[~indices_succeeded], best_format, exact, "coerce", utc + ) + indices_succeeded_small = notna(results_format) + update_indices = np.arange(len(result))[~indices_succeeded][ + indices_succeeded_small + ] + result[update_indices] = results_format[indices_succeeded_small] + tz_parsed[~indices_succeeded][indices_succeeded_small] = timezones_format[ + indices_succeeded_small + ] + indices_succeeded[~indices_succeeded] = indices_succeeded_small + if indices_succeeded.all(): + break + if not indices_succeeded.all(): + # if we exhausted all formats and still have missing values + if errors == "raise": + raise ValueError( + f"""Unable to parse "{arg[~indices_succeeded][0]}" as a date. 
+ You can pass `errors="coerce"` or `errors="ignore"` to + ignore this error.""" + ) + elif errors == "coerce": + result[~indices_succeeded] = iNaT + elif errors == "ignore": + # TODO check + result = arg + return result, tz_parsed def should_cache( @@ -444,27 +625,64 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) - if format is None: - format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - - # `format` could be inferred, or user didn't ask for mixed-format parsing. + # get the list of formats which work for some of the elements + # sorted by the percentage of elements that match, highest first + # It's a list of tuples of (format, percentage of elements that match) + best_format = None if format is not None and format != "mixed": - return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) - - result, tz_parsed = objects_to_datetime64ns( - arg, - dayfirst=dayfirst, - yearfirst=yearfirst, - utc=utc, - errors=errors, - allow_object=True, - ) - - if tz_parsed is not None: - # We can take a shortcut since the datetime64 numpy array - # is in UTC - dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) - return DatetimeIndex._simple_new(dta, name=name) + best_format = format + else: + # guess the format + formats = _guess_datetime_format_for_array( + arg, n_find_format=20, n_check_format=250 + ) + if len(formats) == 0: + result, tz_parsed = objects_to_datetime64ns( + arg, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + errors=errors, + allow_object=True, + ) + if tz_parsed is not None: + # We can take a shortcut since the datetime64 numpy array + # is in UTC + dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) + return DatetimeIndex._simple_new(dta, name=name) + if format != "mixed" and len(formats) > 0: + # formats[0][1] is the percentage of elements that matched + if errors == "raise" and formats[0][1] != 100: + raise ValueError( + "No datetime format was found which " + "matched all values in the 
array.\n" + "You might want to try:\n" + " - passing `format` if your strings have a consistent format;\n" + " - passing `format='ISO8601'` if your strings are " + "all ISO8601 but not necessarily in exactly the same format;\n" + " - passing `format='mixed'`, and the format will be " + "inferred for each element individually. " + "You might want to use `dayfirst` alongside this.\n" + f"Best format found: {formats[0][0]} " + "(matched {formats[0][1]}% of the values)" + ) + best_format, best_format_dayfirst = _try_to_repect_dayfirst( + formats, dayfirst + ) + if best_format_dayfirst is not None and best_format_dayfirst != dayfirst: + warnings.warn( + f"Parsing dates in {best_format} format when " + f"dayfirst={dayfirst} was specified. " + f"Pass `dayfirst={not dayfirst}` or specify a format " + "to silence this warning.", + stacklevel=find_stack_level(), + ) + if best_format is not None: + return _array_strptime_with_fallback(arg, name, utc, best_format, exact, errors) + if format == "mixed": + result, tz_parsed = _iterative_conversion( + arg, formats, utc, unit, errors, dayfirst, yearfirst, exact + ) return _box_as_indexlike(result, utc=utc, name=name) @@ -764,8 +982,9 @@ def to_datetime( - "ISO8601", to parse any `ISO8601 `_ time string (not necessarily in exactly the same format); - - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. + - "mixed", to allow for multiple formats. Values will be parsed iteratively + using the most promising format at each step. This is risky, + and you should probably use it along with `dayfirst`. exact : bool, default True Control how `format` is used: @@ -944,6 +1163,14 @@ def to_datetime( >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT + **Ambiguous format** + + If multiple datetime formats are possible for a value, pandas will try to infer + the most plausible format using the other examples. 
+ + >>> pd.to_datetime(["01-02-2012", "30-01-2012"]) + DatetimeIndex(['2012-02-01', '2012-01-30'], dtype='datetime64[ns]', freq=None) + .. _to_datetime_tz_examples: **Timezones and time offsets** diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 0b5696116e610..bf7e1f8ffc03d 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1302,10 +1302,7 @@ def test_datetime_bool_arrays_mixed(self, cache): to_datetime([False, datetime.today()], cache=cache) with pytest.raises( ValueError, - match=( - r'^time data "True" doesn\'t match format "%Y%m%d", ' - f"at position 1. {PARSING_ERR_MSG}$" - ), + match=(f"{PARSING_ERR_MSG}"), ): to_datetime(["20130101", True], cache=cache) tm.assert_index_equal( @@ -1652,11 +1649,11 @@ def test_mixed_offsets_with_native_datetime_raises(self): mixed = to_datetime(ser) expected = Series( [ - "NaT", + NaT, Timestamp("1990-01-01"), Timestamp("2015-03-14T16:15:14.123-08:00").to_pydatetime(), Timestamp("2019-03-04T21:56:32.620-07:00").to_pydatetime(), - None, + NaT, # TODO check ], dtype=object, ) @@ -1855,7 +1852,7 @@ def test_unit_with_numeric(self, cache, errors, dtype): [ ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"], [1.434692e18, 1.432766e18, "foo", "NaT"], - None, + UserWarning, ], ], ) @@ -2393,10 +2390,7 @@ def test_to_datetime_on_datetime64_series(self, cache): def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) - msg = ( - r'^time data " " doesn\'t match format "%m/%d/%Y", ' - rf"at position 2. 
{PARSING_ERR_MSG}$" - ) + msg = rf"{PARSING_ERR_MSG}" with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) @@ -2467,6 +2461,7 @@ def test_to_datetime_strings_vs_constructor(self, result): expected = Timestamp(2012, 1, 1) assert result == expected + @pytest.mark.filterwarnings("ignore:Could not infer format") def test_to_datetime_unprocessable_input(self, cache): # GH 4928 # GH 21864 @@ -2659,10 +2654,7 @@ def test_dayfirst_warnings_invalid_input(self): with pytest.raises( ValueError, - match=( - r'^time data "03/30/2011" doesn\'t match format ' - rf'"%d/%m/%Y", at position 1. {PARSING_ERR_MSG}$' - ), + match=(rf"{PARSING_ERR_MSG}"), ): to_datetime(arr, dayfirst=True) @@ -2681,32 +2673,77 @@ def test_to_datetime_dta_tz(self, klass): class TestGuessDatetimeFormat: @pytest.mark.parametrize( - "test_list", + "test_list, expected_formats", [ - [ - "2011-12-30 00:00:00.000000", - "2011-12-30 00:00:00.000000", - "2011-12-30 00:00:00.000000", - ], - [np.nan, np.nan, "2011-12-30 00:00:00.000000"], - ["", "2011-12-30 00:00:00.000000"], - ["NaT", "2011-12-30 00:00:00.000000"], - ["2011-12-30 00:00:00.000000", "random_string"], - ["now", "2011-12-30 00:00:00.000000"], - ["today", "2011-12-30 00:00:00.000000"], + ( + [ + "2011-12-30 00:00:00.000000", + "2011-12-30 00:00:00.000000", + "2011-12-30 00:00:00.000000", + ], + np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + ), + ( + [np.nan, np.nan, "2011-12-30 00:00:00.000000"], + np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + ), + ( + ["", "2011-12-30 00:00:00.000000"], + np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + ), + ( + ["NaT", "2011-12-30 00:00:00.000000"], + np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + ), + ( + ["2011-12-30 00:00:00.000000", "random_string"], + np.array([("%Y-%m-%d %H:%M:%S.%f", 50)], dtype=object), + ), + ( + ["now", "2011-12-30 00:00:00.000000"], + 
np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + ), + ( + ["today", "2011-12-30 00:00:00.000000"], + np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + ), + ( + ["30-12-2012 00:00:00.000000", "12-30-2012 00:00:00.000000"], + np.array( + [("%d-%m-%Y %H:%M:%S.%f", 50), ("%m-%d-%Y %H:%M:%S.%f", 50)], + dtype=object, + ), + ), + ( + ["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"], + np.array( + [("%d-%m-%Y", 33), ("%m/%d/%Y", 33), ("%Y-%m-%d %H:%M:%S.%f", 33)], + dtype=object, + ), + ), ], ) - def test_guess_datetime_format_for_array(self, test_list): - expected_format = "%Y-%m-%d %H:%M:%S.%f" + def test_guess_datetime_format_for_array(self, test_list, expected_formats): test_array = np.array(test_list, dtype=object) - assert tools._guess_datetime_format_for_array(test_array) == expected_format + res = tools._guess_datetime_format_for_array( + test_array, n_find_format=5, n_check_format=5 + ) + # sort according to first element of tuple (format string) to ignore order + sorted_index = np.argsort([x[0] for x in res]) + res = res[sorted_index] + sorted_index = np.argsort([x[0] for x in expected_formats]) + expected_formats = expected_formats[sorted_index] + assert (res == expected_formats).all() + # TODO more tests @td.skip_if_not_us_locale def test_guess_datetime_format_for_array_all_nans(self): format_for_string_of_nans = tools._guess_datetime_format_for_array( - np.array([np.nan, np.nan, np.nan], dtype="O") + np.array([np.nan, np.nan, np.nan], dtype="O"), + n_find_format=5, + n_check_format=5, ) - assert format_for_string_of_nans is None + assert len(format_for_string_of_nans) == 0 class TestToDatetimeInferFormat: @@ -2730,10 +2767,7 @@ def test_to_datetime_infer_datetime_format_consistent_format( def test_to_datetime_inconsistent_format(self, cache): data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] ser = Series(np.array(data)) - msg = ( - r'^time data "01-02-2011 00:00:00" doesn\'t match format ' - rf'"%m/%d/%Y 
%H:%M:%S", at position 1. {PARSING_ERR_MSG}$' - ) + msg = f"{PARSING_ERR_MSG}" with pytest.raises(ValueError, match=msg): to_datetime(ser, cache=cache) @@ -3584,3 +3618,206 @@ def test_from_numeric_arrow_dtype(any_numeric_ea_dtype): result = to_datetime(ser) expected = Series([1, 2], dtype="datetime64[ns]") tm.assert_series_equal(result, expected) + + +class TestParsingMultipleDates: + # TODO handle yearfirst + @pytest.mark.parametrize( + "date_str, expected_format, dayfirst", + [ + (["2010-01-01", "2010-02-02", "2010-01-03"], "%Y-%m-%d", None), + (["2010-01-01", "2010-02-13", "2010-01-03"], "%Y-%m-%d", None), + (["01-01-2012", "01-13-2012", "01-03-2010"], "%m-%d-%Y", False), + (["01-01-2012", "13-01-2012", "01-03-2010"], "%d-%m-%Y", True), + ], + ) + def test_multiple_dates_non_ambiguous(self, date_str, expected_format, dayfirst): + # only one format is possible + expected = to_datetime(date_str, format=expected_format) + + # all errors should prefer the format + # which works for all dates + for errors in ["raise", "coerce", "ignore"]: + for try_dayfirst in [True, False]: + # warn if we contradict dayfirst + # we don't warn when format is "%Y-%m-%d" + # TODO same for yearfirst + if dayfirst is not None and try_dayfirst != dayfirst: + with tm.assert_produces_warning(UserWarning): + result = to_datetime( + date_str, errors=errors, dayfirst=try_dayfirst + ) + else: + result = to_datetime(date_str, errors=errors, dayfirst=try_dayfirst) + tm.assert_index_equal(result, expected) + + # should also work with format="mixed" + result = to_datetime(date_str, format="mixed") + tm.assert_index_equal(result, expected) + + # ambiguous dates + @pytest.mark.parametrize( + "date_str", + [ + (["01-01-2012", "01-05-2012", "01-03-2010"]), + (["01-01-2012", "05-01-2012", "01-03-2010"]), + ], + ) + def test_multiple_dates_ambiguous(self, date_str): + # multiple formats work for all dates + # we should respect the dayfirst argument + expected_dayfirst = to_datetime(date_str, 
format="%d-%m-%Y") + expected_not_dayfirst = to_datetime(date_str, format="%m-%d-%Y") + + for errors in ["raise", "coerce", "ignore"]: + result_dayfirst = to_datetime(date_str, errors=errors, dayfirst=True) + tm.assert_index_equal(result_dayfirst, expected_dayfirst) + + result_not_dayfirst = to_datetime(date_str, errors=errors, dayfirst=False) + tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst) + + # # should also work with format="mixed" + result_dayfirst = to_datetime(date_str, format="mixed", dayfirst=True) + tm.assert_index_equal(result_dayfirst, expected_dayfirst) + + result_not_dayfirst = to_datetime(date_str, format="mixed", dayfirst=False) + tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst) + + # ambiguous dates with errors + @pytest.mark.parametrize( + "date_str", + [ + (["01-01-2012", "01-05-2012", "random_string", "01-03-2010"]), + (["01-01-2012", "05-01-2012", "random_string", "01-03-2010"]), + ], + ) + def test_multiple_dates_ambiguous_error(self, date_str): + # multiple formats work for all dates + # we should respect the dayfirst argument + for errors in ["coerce", "ignore"]: + expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y", errors=errors) + expected_not_dayfirst = to_datetime( + date_str, format="%m-%d-%Y", errors=errors + ) + result_dayfirst = to_datetime(date_str, errors=errors, dayfirst=True) + tm.assert_index_equal(result_dayfirst, expected_dayfirst) + + result_not_dayfirst = to_datetime(date_str, errors=errors, dayfirst=False) + tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst) + + # should raise an error with "raise" + with pytest.raises( + ValueError, + match="No datetime format was found which matched all values in the array", + ): + to_datetime(date_str, errors="raise", dayfirst=True) + with pytest.raises( + ValueError, + match="No datetime format was found which matched all values in the array", + ): + to_datetime(date_str, errors="raise", dayfirst=False) + + # same with mixed + 
for errors in ["coerce", "ignore"]: + expected_dayfirst = to_datetime(date_str, format="%d-%m-%Y", errors=errors) + expected_not_dayfirst = to_datetime( + date_str, format="%m-%d-%Y", errors=errors + ) + result_dayfirst = to_datetime( + date_str, errors=errors, dayfirst=True, format="mixed" + ) + tm.assert_index_equal(result_dayfirst, expected_dayfirst) + + result_not_dayfirst = to_datetime( + date_str, errors=errors, dayfirst=False, format="mixed" + ) + tm.assert_index_equal(result_not_dayfirst, expected_not_dayfirst) + + # should raise an error with "raise" + with pytest.raises( + ValueError, match="""Unable to parse "random_string" as a date""" + ): + to_datetime(date_str, errors="raise", dayfirst=True, format="mixed") + with pytest.raises( + ValueError, match="""Unable to parse "random_string" as a date""" + ): + to_datetime(date_str, errors="raise", dayfirst=False, format="mixed") + + # mixed formats + @pytest.mark.parametrize( + "date_str, expected_formats, expected_mixed", + [ + ( + [ + "01-02-2012", + "13-05-2012", + "14-03-2010", + "03-13-2012", + "15-05-2012", + "03-13-2010", + ], + ["%d-%m-%Y", "%m-%d-%Y"], + DatetimeIndex( + [ + "2012-02-01", + "2012-05-13", + "2010-03-14", + "2012-03-13", + "2012-05-15", + "2010-03-13", + ], + dtype="datetime64[ns]", + ), + ), + ( + [ + "01-02-2012", + "05-13-2012", + "03-14-2010", + "13-03-2012", + "05-15-2012", + "13-03-2010", + ], + ["%m-%d-%Y", "%d-%m-%Y"], + DatetimeIndex( + [ + "2012-01-02", + "2012-05-13", + "2010-03-14", + "2012-03-13", + "2012-05-15", + "2010-03-13", + ], + dtype="datetime64[ns]", + ), + ), + ], + ) + def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed): + # no format works for all dates + # raise should raise an error + with pytest.raises( + ValueError, + match="No datetime format was found which matched all values in the array", + ): + to_datetime(date_str, errors="raise") + + # coerce and ignore should choose the format + # which works for the most dates (the 
first one) + for errors in ["coerce", "ignore"]: + expected = to_datetime(date_str, format=expected_formats[0], errors=errors) + if expected_formats[0] == "%d-%m-%Y": + # contradicting default dayfirst=False + with tm.assert_produces_warning(UserWarning): + result = to_datetime(date_str, errors=errors) + else: + result = to_datetime(date_str, errors=errors) + tm.assert_index_equal(result, expected) + + # if format="mixed", the conversion should be done from the best format + # to the worst format + result = to_datetime(date_str, format="mixed") + tm.assert_index_equal(result, expected_mixed) + + # TODO multiple precision + # (["2011-12-30 00:00:00.000000", "2011-11-20"], "%Y-%m-%d", None) From 51d9d98da0a80c72eb2190609611543541d81c8e Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 12 Apr 2023 14:26:35 +0200 Subject: [PATCH 20/37] Update changelog --- doc/source/whatsnew/v2.1.0.rst | 48 +++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b0e9fa2cea0ee..f60ce987552da 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -16,7 +16,53 @@ Enhancements .. _whatsnew_210.enhancements.enhancement1: -enhancement1 +``pd.to_datetime`` now tries to infer the datetime format of each string by considering +the whole Series, and tries to find the format which work for most strings. If several +formats work as well, the one which matches the ``dayfirst`` parameter is returned. If +``format="mixed"``, pandas does the same thing, then tries the second best format on the +strings which failed to parse with the first best format, and so on (:issue:`52508`). + +*Previous behavior*: + +.. code-block:: ipython + + In [1]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"]) + Out[1]: + ValueError: time data "30-01-2012" doesn't match format "%m-%d-%Y", at position 2. 
You might want to try: + - passing `format` if your strings have a consistent format; + - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format; + - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this. + + In [2]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], errors="coerce") + Out[2]: + DatetimeIndex(['2012-01-02', '2012-01-03', 'NaT'], dtype='datetime64[ns]', freq=None) + + In [3]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], format="mixed") + Out[3]: + DatetimeIndex(['2012-01-02', '2012-01-03', '2012-01-30'], dtype='datetime64[ns]', freq=None) + + +*New behavior*: + +.. code-block:: ipython + + In [1]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"]) + Out[1]: + UserWarning: Parsing dates in %d-%m-%Y format when dayfirst=False was specified. + Pass `dayfirst=True` or specify a format to silence this warning. + DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', + freq=None) + + In [2]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], errors="coerce") + Out[2]: + UserWarning: Parsing dates in %d-%m-%Y format when dayfirst=False was specified. Pass `dayfirst=True` or specify a format to silence this warning. + DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None) + + In [3]: pd.to_datetime(["01-02-2012", "01-03-2012", "30-01-2012"], format="mixed") + Out[3]: + DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None) + + ^^^^^^^^^^^^ .. 
_whatsnew_210.enhancements.enhancement2: From 1d7df6eeb2966f68d8cd7b33f211e5bde7be9b27 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 12 Apr 2023 14:49:47 +0200 Subject: [PATCH 21/37] Add missing type hints --- pandas/_libs/tslib.pyi | 1 + pandas/core/tools/datetimes.py | 43 ++++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 9819b5173db56..bd8748cd2650a 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -17,6 +17,7 @@ def array_with_unit_to_datetime( errors: str = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... def first_non_null(values: np.ndarray) -> int: ... +def random_non_null(values: np.ndarray, n: int) -> np.ndarray: ... def array_to_datetime( values: npt.NDArray[np.object_], errors: str = ..., diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 0d7e3f4999787..d0b576c4c0a8f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -129,8 +129,7 @@ class FulldatetimeDict(YearMonthDayDict, total=False): # --------------------------------------------------------------------- -def _check_format_dayfirst(format_string): - dayfirst = False +def _check_format_dayfirst(format_string: str) -> bool | None: for char in ["%d", "%m", "%Y"]: if char not in format_string: return None @@ -150,8 +149,10 @@ def _check_format_dayfirst(format_string): def _guess_datetime_format_for_array( - arr, n_find_format, n_check_format -) -> ArrayLike[tuple[str, str]]: + arr: np.ndarray, + n_find_format: int, + n_check_format: int, +) -> np.ndarray: """ Guess the format of the datetime strings in an array. 
@@ -179,7 +180,7 @@ def _guess_datetime_format_for_array( sample_check = arr[sample_idx] sample_find = sample_check[:n_find_format] if len(sample_idx) == 0: - return [] # FIXME + return np.array([], dtype=object) format_found = set() for datetime_string in sample_find: # catch warnings from guess_datetime_format @@ -193,22 +194,19 @@ def _guess_datetime_format_for_array( if type(datetime_string) is str: format_found.add(guess_datetime_format(datetime_string, dayfirst=False)) format_found.add(guess_datetime_format(datetime_string, dayfirst=True)) - if None in format_found: - format_found.remove(None) # remove YDM as it does not exist # but is returned by guess_datetime_format - for format in list(format_found): - if re.match(r"%Y[-/_.]+%d[-/_.]+%m", format): + for format_ in list(format_found): + if format_ is None or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_): # doesn't exist but is returned by guess_datetime_format - # FIXME - format_found.remove(format) + format_found.remove(format_) # Try to apply the formats found # to a larger sample format_checked = [] - for format in format_found: - converted = array_strptime(sample_check, fmt=format, errors="coerce")[0] + for format_ in format_found: + converted = array_strptime(sample_check, fmt=format_, errors="coerce")[0] format_checked.append( - (format, int(100 * np.sum(~np.isnan(converted)) / len(converted))) + (format_, int(100 * np.sum(~np.isnan(converted)) / len(converted))) ) # Sort by the number of strings that match the format format_checked.sort(key=lambda x: x[1], reverse=True) @@ -228,7 +226,10 @@ def _guess_datetime_format_for_array( return np.array(format_checked, dtype=object) -def _try_to_repect_dayfirst(formats, dayfirst): +def _try_to_repect_dayfirst( + formats: np.ndarray, + dayfirst: bool | None, +) -> tuple[str, bool | None]: """ If several formats work as well, prefer the format which respect dayfirst. 
@@ -262,7 +263,16 @@ def _try_to_repect_dayfirst(formats, dayfirst): return best_formats[0][0], _check_format_dayfirst(best_formats[0][0]) -def _iterative_conversion(arg, formats, utc, unit, errors, dayfirst, yearfirst, exact): +def _iterative_conversion( + arg: np.ndarray, + formats: np.ndarray, + utc: bool, + unit: str | None, + errors: DateTimeErrorChoices, + dayfirst: bool | None, + yearfirst: bool | None, + exact: bool, +) -> tuple[np.ndarray, np.ndarray]: """ For mixed format, convert datetimestrings iteratively, from the best format (the format which work for most samples) @@ -327,7 +337,6 @@ def _iterative_conversion(arg, formats, utc, unit, errors, dayfirst, yearfirst, elif errors == "coerce": result[~indices_succeeded] = iNaT elif errors == "ignore": - # TODO check result = arg return result, tz_parsed From e6cf3ad1efcf7da811fe0568d8671479625856e8 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 12 Apr 2023 15:45:46 +0200 Subject: [PATCH 22/37] Cleaning --- pandas/_libs/tslib.pyx | 4 ---- pandas/tests/tools/test_to_datetime.py | 3 --- 2 files changed, 7 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8b790e3bd8adc..9b2ca61e12c29 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -69,8 +69,6 @@ from pandas._libs.tslibs.nattype cimport ( ) from pandas._libs.tslibs.timestamps cimport _Timestamp -import cython - from pandas._libs.tslibs import ( Resolution, get_resolution, @@ -83,8 +81,6 @@ from libc.time cimport time from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single -# Note: this is the only non-tslibs intra-pandas dependency here - def _test_parse_iso8601(ts: str): """ diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index bf7e1f8ffc03d..b8f58dda81a5e 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3818,6 +3818,3 @@ def 
test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed): # to the worst format result = to_datetime(date_str, format="mixed") tm.assert_index_equal(result, expected_mixed) - - # TODO multiple precision - # (["2011-12-30 00:00:00.000000", "2011-11-20"], "%Y-%m-%d", None) From 6998bf8ad5041073f0c19cc206014b052371291d Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 12 Apr 2023 15:48:27 +0200 Subject: [PATCH 23/37] Typo --- doc/source/whatsnew/v2.1.0.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index f60ce987552da..d5ad1e111dd64 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -17,7 +17,8 @@ Enhancements .. _whatsnew_210.enhancements.enhancement1: ``pd.to_datetime`` now tries to infer the datetime format of each string by considering -the whole Series, and tries to find the format which work for most strings. If several +a random sample (instead of the first non-null sample), +and tries to find the format which work for most strings. If several formats work as well, the one which matches the ``dayfirst`` parameter is returned. If ``format="mixed"``, pandas does the same thing, then tries the second best format on the strings which failed to parse with the first best format, and so on (:issue:`52508`). @@ -63,8 +64,6 @@ strings which failed to parse with the first best format, and so on (:issue:`525 DatetimeIndex(['2012-02-01', '2012-03-01', '2012-01-30'], dtype='datetime64[ns]', freq=None) -^^^^^^^^^^^^ - .. 
_whatsnew_210.enhancements.enhancement2: ``map(func, na_action="ignore")`` now works for all array types From f98ea1f86fa3f9e933ab3c54771bd9930c054a3b Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 12 Apr 2023 15:52:00 +0200 Subject: [PATCH 24/37] comment change --- pandas/core/tools/datetimes.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index d0b576c4c0a8f..66ae3f8cbb6a5 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -634,14 +634,13 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) - # get the list of formats which work for some of the elements - # sorted by the percentage of elements that match, highest first - # It's a list of tuples of (format, percentage of elements that match) best_format = None if format is not None and format != "mixed": best_format = format else: - # guess the format + # get a list of formats which work for some of the elements + # sorted by the percentage of elements that match, highest first + # It's a list of tuples of (format, percentage of elements that match) formats = _guess_datetime_format_for_array( arg, n_find_format=20, n_check_format=250 ) From 86aa61c264df46f1e6e8b21e46de58bdea896786 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Thu, 13 Apr 2023 13:07:32 +0200 Subject: [PATCH 25/37] simplification --- pandas/core/tools/datetimes.py | 238 ++++++++++++------------- pandas/tests/tools/test_to_datetime.py | 111 ++++++------ 2 files changed, 170 insertions(+), 179 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 66ae3f8cbb6a5..8f16d16d37a07 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -148,11 +148,63 @@ def _check_format_dayfirst(format_string: str) -> bool | None: return dayfirst +def _try_to_repect_dayfirst( + formats: np.ndarray, + dayfirst: bool | None, + warn: bool, +) -> str: + """ + If several 
formats work as well, prefer the format which + respect dayfirst. + + Parameters + ---------- + formats : ndarray + Array of tuples with the format and the percentage of strings that + match the format, sorted by the percentage of strings that match the + format. + dayfirst : bool + Should we prefer dayfirst formats + + Returns + ------- + best_format : str + The format among the best formats which respect dayfirst, + if any, otherwise the first best format. + """ + # Find all formats which work for + # the largest number of samples + best_formats = [ + formats_found for formats_found in formats if formats_found[1] == formats[0][1] + ] + # If several formats work as well, prefer the format which + # respect dayfirst + if len(best_formats) > 1: + for formats_found in best_formats: + if _check_format_dayfirst(formats_found[0]) == dayfirst: + return formats_found[0] + if ( + warn + and _check_format_dayfirst(best_formats[0][0]) is not None + and _check_format_dayfirst(best_formats[0][0]) != dayfirst + ): + warnings.warn( + f"Parsing dates in {best_formats[0][0]} format when " + f"dayfirst={dayfirst} was specified. " + f"Pass `dayfirst={not dayfirst}` or specify a format " + "to silence this warning.", + stacklevel=find_stack_level(), + ) + return best_formats[0][0] + + def _guess_datetime_format_for_array( arr: np.ndarray, - n_find_format: int, - n_check_format: int, -) -> np.ndarray: + dayfirst: bool | None, + n_find_format: int = 10, + n_check_format: int = 200, + warn: bool = True, +) -> str | None: """ Guess the format of the datetime strings in an array. @@ -164,6 +216,8 @@ def _guess_datetime_format_for_array( Number of strings to use to guess the format. n_check_format : int Number of strings to check for each format found. 
+ warn: bool + Whether to warn if we contradict dayfirst Returns ------- @@ -180,8 +234,8 @@ def _guess_datetime_format_for_array( sample_check = arr[sample_idx] sample_find = sample_check[:n_find_format] if len(sample_idx) == 0: - return np.array([], dtype=object) - format_found = set() + return None + formats_found = set() for datetime_string in sample_find: # catch warnings from guess_datetime_format # which appears when dayfirst is contradicted @@ -192,26 +246,26 @@ def _guess_datetime_format_for_array( message="Parsing dates in .* format when dayfirst=.* was specified.", ) if type(datetime_string) is str: - format_found.add(guess_datetime_format(datetime_string, dayfirst=False)) - format_found.add(guess_datetime_format(datetime_string, dayfirst=True)) + formats_found.add( + guess_datetime_format(datetime_string, dayfirst=False) + ) + formats_found.add(guess_datetime_format(datetime_string, dayfirst=True)) # remove YDM as it does not exist # but is returned by guess_datetime_format - for format_ in list(format_found): - if format_ is None or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_): + for format_ in list(formats_found): + if (format_ is None) or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_): # doesn't exist but is returned by guess_datetime_format - format_found.remove(format_) + formats_found.remove(format_) # Try to apply the formats found # to a larger sample - format_checked = [] - for format_ in format_found: + formats_checked = [] + for format_ in formats_found: converted = array_strptime(sample_check, fmt=format_, errors="coerce")[0] - format_checked.append( + formats_checked.append( (format_, int(100 * np.sum(~np.isnan(converted)) / len(converted))) ) - # Sort by the number of strings that match the format - format_checked.sort(key=lambda x: x[1], reverse=True) if ( - len(format_checked) == 0 + len(formats_checked) == 0 and len(sample_check) > 1 and np.any([type(e) is str for e in sample_find]) # GH#32264 np.str_ objects @@ -223,49 +277,18 @@ def 
_guess_datetime_format_for_array( UserWarning, stacklevel=find_stack_level(), ) - return np.array(format_checked, dtype=object) - - -def _try_to_repect_dayfirst( - formats: np.ndarray, - dayfirst: bool | None, -) -> tuple[str, bool | None]: - """ - If several formats work as well, prefer the format which - respect dayfirst. - - Parameters - ---------- - formats : ndarray - Array of tuples with the format and the percentage of strings that - match the format, sorted by the percentage of strings that match the - format. - dayfirst : bool - Should we prefer dayfirst formats - - Returns - ------- - best_format : str - The format among the best formats which respect dayfirst, - if any, otherwise the first best format. - """ - # Find all formats which work for - # the largest number of samples - best_formats = [ - format_found for format_found in formats if format_found[1] == formats[0][1] - ] - # If several formats work as well, prefer the format which - # respect dayfirst - if len(best_formats) > 1: - for format_found in best_formats: - if _check_format_dayfirst(format_found[0]) == dayfirst: - return format_found[0], _check_format_dayfirst(format_found[0]) - return best_formats[0][0], _check_format_dayfirst(best_formats[0][0]) + if not len(formats_checked): + return None + else: + # Sort by the number of strings that match the format + formats_checked.sort(key=lambda x: x[1], reverse=True) + best_format = _try_to_repect_dayfirst(formats_checked, dayfirst, warn) + return best_format def _iterative_conversion( arg: np.ndarray, - formats: np.ndarray, + name: str, utc: bool, unit: str | None, errors: DateTimeErrorChoices, @@ -282,10 +305,8 @@ def _iterative_conversion( ---------- arg : ndarray Array of datetime strings. - formats : ndarray - Array of tuples with the format and the percentage of strings that - match the format, sorted by the percentage of strings that match the - format. + name : str + Name of the argument. 
utc : bool Whether to convert/localize timestamps to UTC. unit : str @@ -302,16 +323,18 @@ def _iterative_conversion( """ # iteratively convert the remaining samples # in "coerce" mode with the ith best format - # until all values are converted or all formats are exhausted # or 10 formats have been tried - best_format = _try_to_repect_dayfirst(formats, dayfirst)[0] - # remove the best format from the list - formats = formats[formats[:, 0] != best_format] + # if we contradict dayfirst, we warn for the first format, but not the rest + best_format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst, warn=True) result, tz_parsed = array_strptime(arg, best_format, exact, "coerce", utc) indices_succeeded = notna(result) - for _ in range(min(len(formats), 10)): - best_format = _try_to_repect_dayfirst(formats, dayfirst)[0] - formats = formats[formats[:, 0] != best_format] + for _ in range(10): + best_format = _guess_datetime_format_for_array( + arg[~indices_succeeded], dayfirst=dayfirst, warn=False + ) + + if best_format is None: + break results_format, timezones_format = array_strptime( arg[~indices_succeeded], best_format, exact, "coerce", utc ) @@ -338,7 +361,11 @@ def _iterative_conversion( result[~indices_succeeded] = iNaT elif errors == "ignore": result = arg - return result, tz_parsed + + if any(tz is not None for tz in tz_parsed): + return _return_parsed_timezone_results(result, tz_parsed, utc, name) + + return _box_as_indexlike(result, utc=utc, name=name) def should_cache( @@ -634,64 +661,33 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) - best_format = None + if format is None: + format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) + + # `format` could be inferred, or user didn't ask for mixed-format parsing. 
if format is not None and format != "mixed": - best_format = format - else: - # get a list of formats which work for some of the elements - # sorted by the percentage of elements that match, highest first - # It's a list of tuples of (format, percentage of elements that match) - formats = _guess_datetime_format_for_array( - arg, n_find_format=20, n_check_format=250 - ) - if len(formats) == 0: - result, tz_parsed = objects_to_datetime64ns( - arg, - dayfirst=dayfirst, - yearfirst=yearfirst, - utc=utc, - errors=errors, - allow_object=True, - ) - if tz_parsed is not None: - # We can take a shortcut since the datetime64 numpy array - # is in UTC - dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) - return DatetimeIndex._simple_new(dta, name=name) - if format != "mixed" and len(formats) > 0: - # formats[0][1] is the percentage of elements that matched - if errors == "raise" and formats[0][1] != 100: - raise ValueError( - "No datetime format was found which " - "matched all values in the array.\n" - "You might want to try:\n" - " - passing `format` if your strings have a consistent format;\n" - " - passing `format='ISO8601'` if your strings are " - "all ISO8601 but not necessarily in exactly the same format;\n" - " - passing `format='mixed'`, and the format will be " - "inferred for each element individually. " - "You might want to use `dayfirst` alongside this.\n" - f"Best format found: {formats[0][0]} " - "(matched {formats[0][1]}% of the values)" - ) - best_format, best_format_dayfirst = _try_to_repect_dayfirst( - formats, dayfirst - ) - if best_format_dayfirst is not None and best_format_dayfirst != dayfirst: - warnings.warn( - f"Parsing dates in {best_format} format when " - f"dayfirst={dayfirst} was specified. 
" - f"Pass `dayfirst={not dayfirst}` or specify a format " - "to silence this warning.", - stacklevel=find_stack_level(), - ) - if best_format is not None: - return _array_strptime_with_fallback(arg, name, utc, best_format, exact, errors) + return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) + if format == "mixed": - result, tz_parsed = _iterative_conversion( - arg, formats, utc, unit, errors, dayfirst, yearfirst, exact + return _iterative_conversion( + arg, name, utc, unit, errors, dayfirst, yearfirst, exact ) + result, tz_parsed = objects_to_datetime64ns( + arg, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + errors=errors, + allow_object=True, + ) + + if tz_parsed is not None: + # We can take a shortcut since the datetime64 numpy array + # is in UTC + dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) + return DatetimeIndex._simple_new(dta, name=name) + return _box_as_indexlike(result, utc=utc, name=name) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b8f58dda81a5e..5f5f3bfb1d377 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2673,7 +2673,7 @@ def test_to_datetime_dta_tz(self, klass): class TestGuessDatetimeFormat: @pytest.mark.parametrize( - "test_list, expected_formats", + "test_list, expected_format", [ ( [ @@ -2681,69 +2681,37 @@ class TestGuessDatetimeFormat: "2011-12-30 00:00:00.000000", "2011-12-30 00:00:00.000000", ], - np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), - ), - ( - [np.nan, np.nan, "2011-12-30 00:00:00.000000"], - np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), - ), - ( - ["", "2011-12-30 00:00:00.000000"], - np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), - ), - ( - ["NaT", "2011-12-30 00:00:00.000000"], - np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), - ), - ( - ["2011-12-30 00:00:00.000000", "random_string"], - np.array([("%Y-%m-%d %H:%M:%S.%f", 50)], 
dtype=object), - ), - ( - ["now", "2011-12-30 00:00:00.000000"], - np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), - ), - ( - ["today", "2011-12-30 00:00:00.000000"], - np.array([("%Y-%m-%d %H:%M:%S.%f", 100)], dtype=object), + "%Y-%m-%d %H:%M:%S.%f", ), + ([np.nan, np.nan, "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + (["", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + (["NaT", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + (["2011-12-30 00:00:00.000000", "random_string"], "%Y-%m-%d %H:%M:%S.%f"), + (["now", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), + (["today", "2011-12-30 00:00:00.000000"], "%Y-%m-%d %H:%M:%S.%f"), ( ["30-12-2012 00:00:00.000000", "12-30-2012 00:00:00.000000"], - np.array( - [("%d-%m-%Y %H:%M:%S.%f", 50), ("%m-%d-%Y %H:%M:%S.%f", 50)], - dtype=object, - ), - ), - ( - ["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"], - np.array( - [("%d-%m-%Y", 33), ("%m/%d/%Y", 33), ("%Y-%m-%d %H:%M:%S.%f", 33)], - dtype=object, - ), + "%m-%d-%Y %H:%M:%S.%f", ), + (["30-12-2009", "12/30/2011", "2011-12-30 00:00:00.000000"], "%m/%d/%Y"), ], ) - def test_guess_datetime_format_for_array(self, test_list, expected_formats): + def test_guess_datetime_format_for_array(self, test_list, expected_format): test_array = np.array(test_list, dtype=object) res = tools._guess_datetime_format_for_array( - test_array, n_find_format=5, n_check_format=5 + test_array, dayfirst=False, n_find_format=5, n_check_format=5 ) - # sort according to first element of tuple (format string) to ignore order - sorted_index = np.argsort([x[0] for x in res]) - res = res[sorted_index] - sorted_index = np.argsort([x[0] for x in expected_formats]) - expected_formats = expected_formats[sorted_index] - assert (res == expected_formats).all() - # TODO more tests + assert res == expected_format @td.skip_if_not_us_locale def test_guess_datetime_format_for_array_all_nans(self): format_for_string_of_nans = 
tools._guess_datetime_format_for_array( np.array([np.nan, np.nan, np.nan], dtype="O"), + dayfirst=False, n_find_format=5, n_check_format=5, ) - assert len(format_for_string_of_nans) == 0 + assert format_for_string_of_nans is None class TestToDatetimeInferFormat: @@ -3647,13 +3615,20 @@ def test_multiple_dates_non_ambiguous(self, date_str, expected_format, dayfirst) result = to_datetime( date_str, errors=errors, dayfirst=try_dayfirst ) + # should also work for format="mixed" + result_mixed = to_datetime( + date_str, + errors=errors, + dayfirst=try_dayfirst, + format="mixed", + ) else: result = to_datetime(date_str, errors=errors, dayfirst=try_dayfirst) + result_mixed = to_datetime( + date_str, errors=errors, dayfirst=try_dayfirst, format="mixed" + ) tm.assert_index_equal(result, expected) - - # should also work with format="mixed" - result = to_datetime(date_str, format="mixed") - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result_mixed, expected) # ambiguous dates @pytest.mark.parametrize( @@ -3708,12 +3683,16 @@ def test_multiple_dates_ambiguous_error(self, date_str): # should raise an error with "raise" with pytest.raises( ValueError, - match="No datetime format was found which matched all values in the array", + match="""time data "random_string" doesn't match format "%d-%m-%Y", """ + "at position 2. " + f"{PARSING_ERR_MSG}", ): to_datetime(date_str, errors="raise", dayfirst=True) with pytest.raises( ValueError, - match="No datetime format was found which matched all values in the array", + match="""time data "random_string" doesn't match format "%m-%d-%Y", """ + "at position 2. 
" + f"{PARSING_ERR_MSG}", ): to_datetime(date_str, errors="raise", dayfirst=False) @@ -3796,12 +3775,18 @@ def test_multiple_dates_ambiguous_error(self, date_str): def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed): # no format works for all dates # raise should raise an error + msg = r'^time data ".*" doesn\'t match format ".*", at position .*' with pytest.raises( ValueError, - match="No datetime format was found which matched all values in the array", + match=msg, ): - to_datetime(date_str, errors="raise") - + if expected_formats[0] == "%d-%m-%Y": + # contradicting default dayfirst=False + with tm.assert_produces_warning(UserWarning): + # FIXME: do we need to raise a warning here? + to_datetime(date_str, errors="raise") + else: + to_datetime(date_str, errors="raise") # coerce and ignore should choose the format # which works for the most dates (the first one) for errors in ["coerce", "ignore"]: @@ -3816,5 +3801,15 @@ def test_multiple_dates_mixed(self, date_str, expected_formats, expected_mixed): # if format="mixed", the conversion should be done from the best format # to the worst format - result = to_datetime(date_str, format="mixed") - tm.assert_index_equal(result, expected_mixed) + for errors in ["raise", "coerce", "ignore"]: + if expected_formats[0] == "%d-%m-%Y": + # we raise a warning if the best format used + # (the one which works for the most dates) + # contradict the default dayfirst=False + with tm.assert_produces_warning(UserWarning): + result = to_datetime(date_str, format="mixed", errors=errors) + else: + # we don't raise a warning if other formats used + # contradict dayfirst + result = to_datetime(date_str, format="mixed", errors=errors) + tm.assert_index_equal(result, expected_mixed) From 8c6401b9b30fe240230138ebb83c66badffb350f Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Thu, 13 Apr 2023 13:48:55 +0200 Subject: [PATCH 26/37] remove randomness --- pandas/_libs/tslib.pyx | 10 ++++++---- pandas/core/tools/datetimes.py 
| 14 ++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 9b2ca61e12c29..fb96c9d6115c7 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -401,8 +401,8 @@ def first_non_null(values: ndarray) -> int: @cython.wraparound(False) @cython.boundscheck(False) -def random_non_null(values: ndarray, int n) -> ndarray: - """Find n non-null values selected at random, return an array of indices.""" +def evenly_spaced_non_null(values: ndarray, int n) -> ndarray: + """Find n evenly spaced non-null values, return an array of indices.""" cdef: Py_ssize_t total = len(values) Py_ssize_t i, non_null_count @@ -422,8 +422,10 @@ def random_non_null(values: ndarray, int n) -> ndarray: non_null_count = len(non_null_indices) if non_null_count == 0 or n <= 0: return np.empty(0, dtype=np.int64) - # use np.random.choice - return np.random.choice(non_null_indices, min(n, non_null_count), replace=False) + evenly_spaced_indices = np.linspace(0, len(non_null_indices) - 1, + min(len(non_null_indices), n), + dtype=int) + return np.array(non_null_indices)[evenly_spaced_indices] @cython.wraparound(False) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8f16d16d37a07..8dd72bb3a9984 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -226,15 +226,13 @@ def _guess_datetime_format_for_array( match the format, sorted by the percentage of strings that match the format. 
""" - # Extract a random sample of datetime strings - assert ( - n_find_format <= n_check_format - ), "n_check_format must be greater than n_find_format" - sample_idx = tslib.random_non_null(arr, n_check_format) - sample_check = arr[sample_idx] - sample_find = sample_check[:n_find_format] - if len(sample_idx) == 0: + # Extract a sample of datetime strings + idx_find = tslib.evenly_spaced_non_null(arr, n_find_format) + if len(idx_find) == 0: return None + idx_check = tslib.evenly_spaced_non_null(arr, n_check_format) + sample_check = arr[idx_check] + sample_find = arr[idx_find] formats_found = set() for datetime_string in sample_find: # catch warnings from guess_datetime_format From 28cf6796d23f6bf9ed8dcb537eba2a4e935cbb3e Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 10:08:45 +0200 Subject: [PATCH 27/37] fix parser tests --- pandas/core/tools/datetimes.py | 5 +++-- pandas/tests/io/parser/test_parse_dates.py | 17 +++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8dd72bb3a9984..18a7d621072ae 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -188,9 +188,10 @@ def _try_to_repect_dayfirst( and _check_format_dayfirst(best_formats[0][0]) is not None and _check_format_dayfirst(best_formats[0][0]) != dayfirst ): + default_string = " (the default)" if not dayfirst else "" warnings.warn( f"Parsing dates in {best_formats[0][0]} format when " - f"dayfirst={dayfirst} was specified. " + f"dayfirst={dayfirst}{default_string} was specified. 
" f"Pass `dayfirst={not dayfirst}` or specify a format " "to silence this warning.", stacklevel=find_stack_level(), @@ -251,7 +252,7 @@ def _guess_datetime_format_for_array( # remove YDM as it does not exist # but is returned by guess_datetime_format for format_ in list(formats_found): - if (format_ is None) or re.match(r"%Y[-/_.]+%d[-/_.]+%m", format_): + if (format_ is None) or re.match(r".*%Y.*%d.*%m.*", format_): # doesn't exist but is returned by guess_datetime_format formats_found.remove(format_) # Try to apply the formats found diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 55efb9254ee34..5c82b652ebace 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1794,11 +1794,13 @@ def test_parse_delimited_date_swap_with_warning( def test_parse_multiple_delimited_dates_with_swap_warnings(): # GH46210 - with pytest.raises( - ValueError, + with tm.assert_produces_warning( + UserWarning, match=( - r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", ' - r"at position 1. You might want to try:" + "Parsing dates in %d/%m/%Y format when " + "dayfirst=False \\(the default\\) was specified. " + "Pass `dayfirst=True` or specify a format " + "to silence this warning." ), ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) @@ -2008,10 +2010,9 @@ def test_dayfirst_warnings(): tm.assert_index_equal(expected, res5) # B. 
use dayfirst=False - with tm.assert_produces_warning(UserWarning, match=warning_msg): - res6 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" - ).index + res6 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" + ).index tm.assert_index_equal(expected, res6) From a22114ca6769ac645f2f6d9a611a4bd88176b3ab Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 11:18:41 +0200 Subject: [PATCH 28/37] simplify getting evenly spaced non null --- pandas/_libs/tslib.pyi | 1 - pandas/_libs/tslib.pyx | 32 +------------------------------- pandas/core/tools/datetimes.py | 17 ++++++++++++----- 3 files changed, 13 insertions(+), 37 deletions(-) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index bd8748cd2650a..9819b5173db56 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -17,7 +17,6 @@ def array_with_unit_to_datetime( errors: str = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... def first_non_null(values: np.ndarray) -> int: ... -def random_non_null(values: np.ndarray, n: int) -> np.ndarray: ... 
def array_to_datetime( values: npt.NDArray[np.object_], errors: str = ..., diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index fb96c9d6115c7..106f203a16855 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -75,8 +75,7 @@ from pandas._libs.tslibs import ( ) from pandas._libs.tslibs.timestamps import Timestamp -from libc.stdlib cimport srand -from libc.time cimport time +# Note: this is the only non-tslibs intra-pandas dependency here from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single @@ -399,35 +398,6 @@ def first_non_null(values: ndarray) -> int: return -1 -@cython.wraparound(False) -@cython.boundscheck(False) -def evenly_spaced_non_null(values: ndarray, int n) -> ndarray: - """Find n evenly spaced non-null values, return an array of indices.""" - cdef: - Py_ssize_t total = len(values) - Py_ssize_t i, non_null_count - list non_null_indices = [] - srand(time(NULL)) - for i in range(total): - val = values[i] - if checknull_with_nat_and_na(val): - continue - if ( - isinstance(val, str) - and - (len(val) == 0 or val in nat_strings or val in ("now", "today")) - ): - continue - non_null_indices.append(i) - non_null_count = len(non_null_indices) - if non_null_count == 0 or n <= 0: - return np.empty(0, dtype=np.int64) - evenly_spaced_indices = np.linspace(0, len(non_null_indices) - 1, - min(len(non_null_indices), n), - dtype=int) - return np.array(non_null_indices)[evenly_spaced_indices] - - @cython.wraparound(False) @cython.boundscheck(False) cpdef array_to_datetime( diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 18a7d621072ae..bd5c128021a90 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -228,12 +228,19 @@ def _guess_datetime_format_for_array( format. 
""" # Extract a sample of datetime strings - idx_find = tslib.evenly_spaced_non_null(arr, n_find_format) - if len(idx_find) == 0: + # ignore missing + arr_non_null = arr[notna(arr)] + arr_non_null = arr_non_null[ + ~np.isin(arr_non_null, ["", "now", "today"] + list(nat_strings)) + ] + if len(arr_non_null) == 0: return None - idx_check = tslib.evenly_spaced_non_null(arr, n_check_format) - sample_check = arr[idx_check] - sample_find = arr[idx_find] + # get evenly spaced non-null indices + step_find = max(len(arr_non_null) // n_find_format, 1) + step_check = max(len(arr_non_null) // n_check_format, 1) + sample_check = arr_non_null[np.arange(0, len(arr_non_null), step_check)] + sample_find = arr_non_null[np.arange(0, len(arr_non_null), step_find)] + # try formats formats_found = set() for datetime_string in sample_find: # catch warnings from guess_datetime_format From 75bb8f64490d93090e62949709779de44acf28c9 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 11:32:02 +0200 Subject: [PATCH 29/37] update io readme --- doc/source/user_guide/io.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 60353dde5683f..ec677ea1030c0 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -977,11 +977,10 @@ Note that format inference is sensitive to ``dayfirst``. With ``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With ``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th. -If you try to parse a column of date strings, pandas will attempt to guess the format -from the first non-NaN element, and will then parse the rest of the column with that -format. If pandas fails to guess the format (for example if your first string is -``'01 December US/Pacific 2000'``), then a warning will be raised and each -row will be parsed individually by ``dateutil.parser.parse``. 
The safest +If you try to parse a column of date strings, pandas will attempt to find the format +which work best from a sample of non-NaN elements, and will then parse the rest of the +column with that format. If pandas fails to guess the format, then a warning will be +raised and each row will be parsed individually by ``dateutil.parser.parse``. The safest way to parse dates is to explicitly set ``format=``. .. ipython:: python @@ -994,7 +993,9 @@ way to parse dates is to explicitly set ``format=``. df In the case that you have mixed datetime formats within the same column, you can -pass ``format='mixed'`` +pass ``format='mixed'``. Pandas will convert rows to the best format found (the one +which matches the most rows), and then iteratively convert the remaining rows with the +remaining formats. .. ipython:: python From 6f155b5fd76925b23038a688a9608e7d5fd9559c Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 11:41:33 +0200 Subject: [PATCH 30/37] revert changed tests --- pandas/tests/tools/test_to_datetime.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 5f5f3bfb1d377..9e59f047142ed 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1302,7 +1302,10 @@ def test_datetime_bool_arrays_mixed(self, cache): to_datetime([False, datetime.today()], cache=cache) with pytest.raises( ValueError, - match=(f"{PARSING_ERR_MSG}"), + match=( + r'^time data "True" doesn\'t match format "%Y%m%d", ' + f"at position 1. 
{PARSING_ERR_MSG}$" + ), ): to_datetime(["20130101", True], cache=cache) tm.assert_index_equal( @@ -2390,7 +2393,10 @@ def test_to_datetime_on_datetime64_series(self, cache): def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) - msg = rf"{PARSING_ERR_MSG}" + msg = ( + r'^time data " " doesn\'t match format "%m/%d/%Y", ' + rf"at position 2. {PARSING_ERR_MSG}$" + ) with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) @@ -2735,7 +2741,10 @@ def test_to_datetime_infer_datetime_format_consistent_format( def test_to_datetime_inconsistent_format(self, cache): data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] ser = Series(np.array(data)) - msg = f"{PARSING_ERR_MSG}" + msg = ( + r'^time data "01-02-2011 00:00:00" doesn\'t match format ' + rf'"%m/%d/%Y %H:%M:%S", at position 1. {PARSING_ERR_MSG}$' + ) with pytest.raises(ValueError, match=msg): to_datetime(ser, cache=cache) From 2b2648e7fc19008beed7fde85ae67642f14051c5 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 12:51:40 +0200 Subject: [PATCH 31/37] fix type hints --- doc/source/whatsnew/v0.19.0.rst | 2 +- pandas/core/tools/datetimes.py | 26 ++++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index d4b879f137698..a0684db51c53e 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -765,7 +765,7 @@ Previously if ``.to_datetime()`` encountered mixed integers/floats and strings, This will now convert integers/floats with the default unit of ``ns``. .. 
ipython:: python - + :okwarning: pd.to_datetime([1, "foo"], errors="coerce") Bug fixes related to ``.to_datetime()``: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index bd5c128021a90..589d8ce9137dc 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -149,7 +149,7 @@ def _check_format_dayfirst(format_string: str) -> bool | None: def _try_to_repect_dayfirst( - formats: np.ndarray, + formats: list, dayfirst: bool | None, warn: bool, ) -> str: @@ -241,7 +241,7 @@ def _guess_datetime_format_for_array( sample_check = arr_non_null[np.arange(0, len(arr_non_null), step_check)] sample_find = arr_non_null[np.arange(0, len(arr_non_null), step_find)] # try formats - formats_found = set() + formats_found = [] for datetime_string in sample_find: # catch warnings from guess_datetime_format # which appears when dayfirst is contradicted @@ -252,14 +252,17 @@ def _guess_datetime_format_for_array( message="Parsing dates in .* format when dayfirst=.* was specified.", ) if type(datetime_string) is str: - formats_found.add( + formats_found.append( guess_datetime_format(datetime_string, dayfirst=False) ) - formats_found.add(guess_datetime_format(datetime_string, dayfirst=True)) + formats_found.append( + guess_datetime_format(datetime_string, dayfirst=True) + ) + formats_found = [format_ for format_ in formats_found if format_ is not None] # remove YDM as it does not exist # but is returned by guess_datetime_format - for format_ in list(formats_found): - if (format_ is None) or re.match(r".*%Y.*%d.*%m.*", format_): + for format_ in np.unique(formats_found): + if re.match(r".*%Y.*%d.*%m.*", format_): # doesn't exist but is returned by guess_datetime_format formats_found.remove(format_) # Try to apply the formats found @@ -283,25 +286,27 @@ def _guess_datetime_format_for_array( UserWarning, stacklevel=find_stack_level(), ) + print(formats_checked) if not len(formats_checked): return None else: # Sort by the number of strings 
that match the format formats_checked.sort(key=lambda x: x[1], reverse=True) best_format = _try_to_repect_dayfirst(formats_checked, dayfirst, warn) + print(best_format) return best_format def _iterative_conversion( arg: np.ndarray, - name: str, + name: Hashable, utc: bool, unit: str | None, errors: DateTimeErrorChoices, dayfirst: bool | None, yearfirst: bool | None, exact: bool, -) -> tuple[np.ndarray, np.ndarray]: +) -> Index: """ For mixed format, convert datetimestrings iteratively, from the best format (the format which work for most samples) @@ -312,7 +317,7 @@ def _iterative_conversion( arg : ndarray Array of datetime strings. name : str - Name of the argument. + None or string for the Index name utc : bool Whether to convert/localize timestamps to UTC. unit : str @@ -537,7 +542,7 @@ def _convert_and_box_cache( def _return_parsed_timezone_results( - result: np.ndarray, timezones, utc: bool, name: str + result: np.ndarray, timezones, utc: bool, name: Hashable ) -> Index: """ Return results from array_strptime if a %z or %Z directive was passed. @@ -995,6 +1000,7 @@ def to_datetime( - "mixed", to allow for multiple formats. Values will be parsed iteratively using the most promising format at each step. This is risky, and you should probably use it along with `dayfirst`. 
+ exact : bool, default True Control how `format` is used: From 3f02e0a5782b99ef0cc232c85fafe34ec70ec136 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 14:42:50 +0200 Subject: [PATCH 32/37] fix type hints for np.unique --- pandas/core/tools/datetimes.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 589d8ce9137dc..7c3299b57ddda 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -252,13 +252,12 @@ def _guess_datetime_format_for_array( message="Parsing dates in .* format when dayfirst=.* was specified.", ) if type(datetime_string) is str: - formats_found.append( - guess_datetime_format(datetime_string, dayfirst=False) - ) - formats_found.append( - guess_datetime_format(datetime_string, dayfirst=True) - ) - formats_found = [format_ for format_ in formats_found if format_ is not None] + for try_dayfirst in [False, True]: + format_found = guess_datetime_format( + datetime_string, dayfirst=try_dayfirst + ) + if format_found is not None: + formats_found.append(format_found) # remove YDM as it does not exist # but is returned by guess_datetime_format for format_ in np.unique(formats_found): From feaa7a365285240c6008783568d8db3b1d8d3a50 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 15:24:32 +0200 Subject: [PATCH 33/37] remove prints --- pandas/core/tools/datetimes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7c3299b57ddda..401f9a8bc7909 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -285,14 +285,12 @@ def _guess_datetime_format_for_array( UserWarning, stacklevel=find_stack_level(), ) - print(formats_checked) if not len(formats_checked): return None else: # Sort by the number of strings that match the format formats_checked.sort(key=lambda x: x[1], reverse=True) best_format = 
_try_to_repect_dayfirst(formats_checked, dayfirst, warn) - print(best_format) return best_format From 60148b18448c35de36cef1122027030b1c150822 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 17:12:59 +0200 Subject: [PATCH 34/37] fix doc --- doc/source/whatsnew/v0.19.0.rst | 1 + pandas/core/tools/datetimes.py | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index a0684db51c53e..74a4bebef84c5 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -766,6 +766,7 @@ This will now convert integers/floats with the default unit of ``ns``. .. ipython:: python :okwarning: + pd.to_datetime([1, "foo"], errors="coerce") Bug fixes related to ``.to_datetime()``: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 401f9a8bc7909..c0a52a73c4f0d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -213,16 +213,18 @@ def _guess_datetime_format_for_array( ---------- arr : ndarray Array of datetime strings. + dayfirst : bool + dayfirst parsing behavior from to_datetime. n_find_format : int Number of strings to use to guess the format. n_check_format : int Number of strings to check for each format found. - warn: bool - Whether to warn if we contradict dayfirst + warn : bool + Whether to warn if we contradict dayfirst. Returns ------- - formats : ndarray + ndarray Array of tuples with the format and the percentage of strings that match the format, sorted by the percentage of strings that match the format. @@ -1181,8 +1183,8 @@ def to_datetime( If multiple datetime formats are possible for a value, pandas will try to infer the most plausible format using the other examples. 
- >>> pd.to_datetime(["01-02-2012", "30-01-2012"]) - DatetimeIndex(['2012-02-01', '2012-01-30'], dtype='datetime64[ns]', freq=None) + >>> pd.to_datetime(["01-02-2012", "02-30-2012"]) + DatetimeIndex(['2012-01-02', '2012-02-30'], dtype='datetime64[ns]', freq=None) .. _to_datetime_tz_examples: From 6622eba05b28ec0544a10446d5f0df83a22c5e93 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 17:58:08 +0200 Subject: [PATCH 35/37] fix example with febuary 30th --- pandas/core/tools/datetimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index c0a52a73c4f0d..922cb2f396702 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1183,8 +1183,8 @@ def to_datetime( If multiple datetime formats are possible for a value, pandas will try to infer the most plausible format using the other examples. - >>> pd.to_datetime(["01-02-2012", "02-30-2012"]) - DatetimeIndex(['2012-01-02', '2012-02-30'], dtype='datetime64[ns]', freq=None) + >>> pd.to_datetime(["01-02-2012", "02-27-2012"]) + DatetimeIndex(['2012-01-02', '2012-02-27'], dtype='datetime64[ns]', freq=None) .. _to_datetime_tz_examples: From 23b28b93585dd7f76a51a5ab52b4a84af89426ae Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 14 Apr 2023 20:21:28 +0200 Subject: [PATCH 36/37] fix doc --- pandas/core/tools/datetimes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 922cb2f396702..59b44d8f8aba7 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -996,9 +996,10 @@ def to_datetime( - "ISO8601", to parse any `ISO8601 `_ time string (not necessarily in exactly the same format); + - "mixed", to allow for multiple formats. Values will be parsed iteratively - using the most promising format at each step. This is risky, - and you should probably use it along with `dayfirst`. 
+ using the most promising format at each step. This is risky, + and you should probably use it along with `dayfirst`. exact : bool, default True Control how `format` is used: From 5422bfa3c257d9472533cfcb3f0fa3cc483495d3 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Mon, 24 Apr 2023 16:09:19 +0200 Subject: [PATCH 37/37] check if any str at the beginning of _guess_datetime_format_for_array --- pandas/core/tools/datetimes.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 59b44d8f8aba7..9e4b67bb265c7 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -242,6 +242,9 @@ def _guess_datetime_format_for_array( step_check = max(len(arr_non_null) // n_check_format, 1) sample_check = arr_non_null[np.arange(0, len(arr_non_null), step_check)] sample_find = arr_non_null[np.arange(0, len(arr_non_null), step_find)] + if not np.any([type(e) is str for e in sample_find]): + # GH#32264 np.str_ objects + return None # try formats formats_found = [] for datetime_string in sample_find: @@ -274,20 +277,15 @@ def _guess_datetime_format_for_array( formats_checked.append( (format_, int(100 * np.sum(~np.isnan(converted)) / len(converted))) ) - if ( - len(formats_checked) == 0 - and len(sample_check) > 1 - and np.any([type(e) is str for e in sample_find]) - # GH#32264 np.str_ objects - ): - warnings.warn( - "Could not infer format, so each element will be parsed " - "individually, falling back to `dateutil`. To ensure parsing is " - "consistent and as-expected, please specify a format.", - UserWarning, - stacklevel=find_stack_level(), - ) if not len(formats_checked): + if len(sample_check) > 1: + warnings.warn( + "Could not infer format, so each element will be parsed " + "individually, falling back to `dateutil`. 
To ensure parsing is " + "consistent and as-expected, please specify a format.", + UserWarning, + stacklevel=find_stack_level(), + ) return None else: # Sort by the number of strings that match the format