From 044948f7b4fb7d39dd497031f5e25f63d7cdb4dd Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 23 Jan 2023 11:08:02 +0000 Subject: [PATCH 01/12] allow format iso8601 --- doc/source/user_guide/io.rst | 9 +++ doc/source/whatsnew/v2.0.0.rst | 6 ++ pandas/_libs/tslibs/strptime.pyx | 63 ++++++++++------- pandas/core/tools/datetimes.py | 12 +++- pandas/tests/io/parser/test_parse_dates.py | 3 +- pandas/tests/tools/test_to_datetime.py | 67 +++++++++++++++---- .../0004-consistent-to-datetime-parsing.md | 8 ++- 7 files changed, 129 insertions(+), 39 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index dc21b9f35d272..229f340b854ec 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1011,6 +1011,15 @@ first read it in as an object dtype and then apply :func:`to_datetime` to each e df['date'] = df['date'].apply(pd.to_datetime) df +or, if your datetime formats are all ISO8601: + +.. ipython:: python + + data = io.StringIO("date\n2020-01-01\n2020-01-01 03:00\n") + df = pd.read_csv(data) + df['date'] = pd.to_datetime(df['date'], format='ISO8601') + df + .. 
ipython:: python :suppress: diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 605f1d4b26e13..707ffc55bd05e 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -168,6 +168,7 @@ Other enhancements - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`) - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`) - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`) +- :func:`to_datetime` now accepts ``"ISO8601"`` as an argument to ``format``, which will match any ISO8601 string (:issue:`50411`) - .. --------------------------------------------------------------------------- @@ -559,6 +560,11 @@ to each element individually, e.g. :: ser = pd.Series(['13-01-2000', '12 January 2000']) ser.apply(pd.to_datetime) +or, if your formats are all ISO8601, :: + + ser = pd.Series(['2020-01-01', '2020-01-01 03:00']) + pd.to_datetime(ser, format='ISO8601') + .. 
_whatsnew_200.api_breaking.other: Other API changes diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 3ca87f8680b53..4f5543fff6dcc 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -152,6 +152,7 @@ cdef dict _parse_code_table = {"y": 0, def array_strptime( ndarray[object] values, str fmt, + bint fmt_inferred=False, bint exact=True, errors="raise", bint utc=False, @@ -186,6 +187,7 @@ def array_strptime( bint iso_format = format_is_iso(fmt) NPY_DATETIMEUNIT out_bestunit int out_local = 0, out_tzoffset = 0 + bint string_to_dts_succeeded = 0 assert is_raise or is_ignore or is_coerce @@ -306,43 +308,58 @@ def array_strptime( else: val = str(val) - if iso_format: - string_to_dts_failed = string_to_dts( + if fmt == "ISO8601": + string_to_dts_succeeded = not string_to_dts( + val, &dts, &out_bestunit, &out_local, + &out_tzoffset, False, None, False + ) + elif iso_format: + string_to_dts_succeeded = not string_to_dts( val, &dts, &out_bestunit, &out_local, &out_tzoffset, False, fmt, exact ) - if not string_to_dts_failed: - # No error reported by string_to_dts, pick back up - # where we left off - value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) - if out_local == 1: - # Store the out_tzoffset in seconds - # since we store the total_seconds of - # dateutil.tz.tzoffset objects - tz = timezone(timedelta(minutes=out_tzoffset)) - result_timezone[i] = tz - out_local = 0 - out_tzoffset = 0 - iresult[i] = value - check_dts_bounds(&dts) - continue + if string_to_dts_succeeded: + # No error reported by string_to_dts, pick back up + # where we left off + value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) + if out_local == 1: + # Store the out_tzoffset in seconds + # since we store the total_seconds of + # dateutil.tz.tzoffset objects + tz = timezone(timedelta(minutes=out_tzoffset)) + result_timezone[i] = tz + out_local = 0 + out_tzoffset = 0 + iresult[i] = value + check_dts_bounds(&dts) + continue + 
else: + if fmt == "ISO8601": + raise ValueError(f"Time data {val} is not ISO8601 format") if parse_today_now(val, &iresult[i], utc): continue # Some ISO formats can't be parsed by string_to_dts - # For example, 6-digit YYYYMD. So, if there's an error, - # try the string-matching code below. + # For example, 6-digit YYYYMD. So, if there's an error, and a format + # was specified, then try the string-matching code below. If the format + # specified was 'ISO8601', then we need to error, because + # only string_to_dts handles mixed ISO8601 formats. + if not string_to_dts_succeeded and fmt == "ISO8601": + raise ValueError(f"Time data {val} is not ISO8601 format") # exact matching if exact: found = format_regex.match(val) if not found: - raise ValueError(f"time data \"{val}\" doesn't " - f"match format \"{fmt}\"") + raise ValueError( + f"time data \"{val}\" doesn't " + f"match {'(inferred) '*fmt_inferred}format \"{fmt}\"" + ) if len(val) != found.end(): raise ValueError( - f"unconverted data remains: " + "unconverted data remains when parsing with " + f"{'(inferred) '*fmt_inferred}format \"{fmt}\": " f'"{val[found.end():]}"' ) @@ -352,7 +369,7 @@ def array_strptime( if not found: raise ValueError( f"time data \"{val}\" doesn't match " - f"format \"{fmt}\"" + f"{'(inferred) '*fmt_inferred}format \"{fmt}\"" ) iso_year = -1 diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 152bfcb8822a4..1895afb91c049 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -442,11 +442,15 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) + format_inferred = False if format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) + format_inferred = True if format is not None: - return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) + return _array_strptime_with_fallback( + arg, name, utc, format, format_inferred, exact, errors + ) result, tz_parsed = objects_to_datetime64ns( arg, 
@@ -471,13 +475,16 @@ def _array_strptime_with_fallback( name, utc: bool, fmt: str, + fmt_inferred: bool, exact: bool, errors: str, ) -> Index: """ Call array_strptime, with fallback behavior depending on 'errors'. """ - result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc) + result, timezones = array_strptime( + arg, fmt, fmt_inferred=fmt_inferred, exact=exact, errors=errors, utc=utc + ) if any(tz is not None for tz in timezones): return _return_parsed_timezone_results(result, timezones, utc, name) @@ -759,6 +766,7 @@ def to_datetime( `_ for more information on choices, though note that :const:`"%f"` will parse all the way up to nanoseconds. + You can also pass "ISO8601" to parse any ISO8601 time string. exact : bool, default True Control how `format` is used: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index fc477a899d089..d7f21378309d2 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1719,7 +1719,8 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): with pytest.raises( ValueError, match=( - r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", at position 1$' + r'^time data "31/05/2000" doesn\'t match \(inferred\) format "%m/%d/%Y", ' + r"at position 1$" ), ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 3509c82d2af6d..7e841b6bda416 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -133,7 +133,11 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): ser2 = ser.apply(str) ser2[2] = "nat" with pytest.raises( - ValueError, match='unconverted data remains: ".0", at position 0' + ValueError, + match=( + 'unconverted data remains when parsing with format "%Y%m%d": ".0", ' + "at position 0", + ), ): # 
https://github.com/pandas-dev/pandas/issues/50051 to_datetime(ser2, format="%Y%m%d", cache=cache) @@ -528,7 +532,8 @@ def test_to_datetime_parse_timezone_malformed(self, offset): msg = "|".join( [ r'^time data ".*" doesn\'t match format ".*", at position 0$', - r'^unconverted data remains: ".*", at position 0$', + r'^unconverted data remains when parsing with format ".*": ".*", ' + "at position 0$", ] ) with pytest.raises(ValueError, match=msg): @@ -1288,7 +1293,10 @@ def test_datetime_bool_arrays_mixed(self, cache): to_datetime([False, datetime.today()], cache=cache) with pytest.raises( ValueError, - match=r'^time data "True" doesn\'t match format "%Y%m%d", at position 1$', + match=( + r'^time data "True" doesn\'t match \(inferred\) format "%Y%m%d", ' + "at position 1$" + ), ): to_datetime(["20130101", True], cache=cache) tm.assert_index_equal( @@ -1331,7 +1339,8 @@ def test_datetime_invalid_scalar(self, value, format, warning): [ r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0$', r'^Given date string "a" not likely a datetime, at position 0$', - r'^unconverted data remains: "9", at position 0$', + r'^unconverted data remains when parsing with format "%H:%M:%S": "9", ' + "at position 0$", r"^second must be in 0..59: 00:01:99, at position 0$", ] ) @@ -1382,7 +1391,8 @@ def test_datetime_invalid_index(self, values, format, warning): [ r'^Given date string "a" not likely a datetime, at position 0$', r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0$', - r'^unconverted data remains: "9", at position 0$', + r'^unconverted data remains when parsing with format "%H:%M:%S": "9", ' + "at position 0$", r"^second must be in 0..59: 00:01:99, at position 0$", ] ) @@ -2143,8 +2153,8 @@ def test_dataframe_float(self, cache): # float df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]}) msg = ( - r"^cannot assemble the datetimes: unconverted data remains: " - r'"1", at position 0$' + r"^cannot assemble the datetimes: unconverted 
data remains when parsing " + r'with format ".*": "1", at position 0$' ) with pytest.raises(ValueError, match=msg): to_datetime(df, cache=cache) @@ -2226,7 +2236,8 @@ def test_to_datetime_iso8601_exact_fails(self, input, format): # `format` is shorter than the date string, so only fails with `exact=True` msg = "|".join( [ - '^unconverted data remains: ".*", at position 0$', + '^unconverted data remains when parsing with format ".*": ".*"' + ", at position 0$", 'time data ".*" doesn\'t match format ".*", at position 0', ] ) @@ -2360,7 +2371,10 @@ def test_to_datetime_on_datetime64_series(self, cache): def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) - msg = r'^time data " " doesn\'t match format "%m/%d/%Y", at position 2$' + msg = ( + r'^time data " " doesn\'t match \(inferred\) format "%m/%d/%Y", ' + "at position 2$" + ) with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) @@ -2624,7 +2638,7 @@ def test_dayfirst_warnings_invalid_input(self): with pytest.raises( ValueError, match=( - r'^time data "03/30/2011" doesn\'t match format ' + r'^time data "03/30/2011" doesn\'t match \(inferred\) format ' r'"%d/%m/%Y", at position 1$' ), ): @@ -2695,7 +2709,7 @@ def test_to_datetime_inconsistent_format(self, cache): data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] ser = Series(np.array(data)) msg = ( - r'^time data "01-02-2011 00:00:00" doesn\'t match format ' + r'^time data "01-02-2011 00:00:00" doesn\'t match \(inferred\) format ' r'"%m/%d/%Y %H:%M:%S", at position 1$' ) with pytest.raises(ValueError, match=msg): @@ -2835,7 +2849,8 @@ def test_day_not_in_month_raise(self, cache): ( "2015-02-32", "%Y-%m-%d", - '^unconverted data remains: "2", at position 0$', + '^unconverted data remains when parsing with format "%Y-%d-%m": "2", ' + "at position 0$", ), ( "2015-32-02", @@ -3490,3 
+3505,31 @@ def test_to_datetime_format_f_parse_nanos(): nanosecond=789, ) assert result == expected + + +def test_to_datetime_mixed_iso8601(): + # https://github.com/pandas-dev/pandas/issues/50411 + result = to_datetime(["2020-01-01", "2020-01-01 05:00:00"], format="ISO8601") + expected = DatetimeIndex(["2020-01-01 00:00:00", "2020-01-01 05:00:00"]) + tm.assert_index_equal(result, expected) + + +def test_to_datetime_mixed_not_necessarily_iso8601_raise(): + # https://github.com/pandas-dev/pandas/issues/50411 + with pytest.raises( + ValueError, match="Time data 01-01-2000 is not ISO8601 format, at position 1" + ): + to_datetime(["2020-01-01", "01-01-2000"], format="ISO8601") + + +@pytest.mark.parametrize( + ("errors", "expected"), + [ + ("coerce", DatetimeIndex(["2020-01-01 00:00:00", NaT])), + ("ignore", Index(["2020-01-01", "01-01-2000"])), + ], +) +def test_to_datetime_mixed_not_necessarily_iso8601_coerce(errors, expected): + # https://github.com/pandas-dev/pandas/issues/50411 + result = to_datetime(["2020-01-01", "01-01-2000"], format="ISO8601", errors=errors) + tm.assert_index_equal(result, expected) diff --git a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md index 7635fabe2dbc6..3ad58985ef727 100644 --- a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md +++ b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md @@ -4,7 +4,7 @@ - Status: Accepted - Discussion: [#48621](https://github.com/pandas-dev/pandas/pull/48621) - Author: [Marco Gorelli](https://github.com/MarcoGorelli) -- Revision: 1 +- Revision: 2 ## Abstract @@ -64,6 +64,11 @@ Out[3]: 1 2000-01-13 dtype: datetime64[ns] ``` +or, if their dates are all ISO8601, +```ipython +In [4]: pd.to_datetime(['2020-01-01', '2020-01-01 03:00'], format='ISO8601') +Out[4]: DatetimeIndex(['2020-01-01 00:00:00', '2020-01-01 03:00:00'], dtype='datetime64[ns]', freq=None) +``` ## Usage and Impact @@ -99,3 +104,4 @@ We could make 
``guess_datetime_format`` smarter by using a random sample of elem ### PDEP History - 18 September 2022: Initial draft +- 23 January 2023: Amended to mention ``format='ISO8601'`` option From f4e139251afdc07f617d0fed047844e6f6dc2592 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 23 Jan 2023 11:12:00 +0000 Subject: [PATCH 02/12] fixup tests --- pandas/tests/tools/test_to_datetime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 7e841b6bda416..1dceaab2e2fb8 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -136,7 +136,7 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): ValueError, match=( 'unconverted data remains when parsing with format "%Y%m%d": ".0", ' - "at position 0", + "at position 0" ), ): # https://github.com/pandas-dev/pandas/issues/50051 @@ -2849,7 +2849,7 @@ def test_day_not_in_month_raise(self, cache): ( "2015-02-32", "%Y-%m-%d", - '^unconverted data remains when parsing with format "%Y-%d-%m": "2", ' + '^unconverted data remains when parsing with format "%Y-%m-%d": "2", ' "at position 0$", ), ( From 9f06d8006165785a6dfa8612e960b2528d9735fe Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 23 Jan 2023 12:54:18 +0000 Subject: [PATCH 03/12] :label: typing --- pandas/_libs/tslibs/strptime.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslibs/strptime.pyi b/pandas/_libs/tslibs/strptime.pyi index 4565bb7ecf959..78a40a896b2ff 100644 --- a/pandas/_libs/tslibs/strptime.pyi +++ b/pandas/_libs/tslibs/strptime.pyi @@ -5,6 +5,7 @@ from pandas._typing import npt def array_strptime( values: npt.NDArray[np.object_], fmt: str | None, + fmt_inferred: bool = ..., exact: bool = ..., errors: str = ..., utc: bool = ..., From d7f60567fea679d62db8139cf39d580ddd9c1b12 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 23 Jan 2023 13:05:32 +0000 Subject: [PATCH 
04/12] remove duplicate code --- pandas/_libs/tslibs/strptime.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 4f5543fff6dcc..1733d6fafd58d 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -333,9 +333,6 @@ def array_strptime( iresult[i] = value check_dts_bounds(&dts) continue - else: - if fmt == "ISO8601": - raise ValueError(f"Time data {val} is not ISO8601 format") if parse_today_now(val, &iresult[i], utc): continue From 6e6d579d3da6c1c0e058d08fcb0a9d0340ed69f6 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 23 Jan 2023 15:15:01 +0000 Subject: [PATCH 05/12] improve message, use if-statement --- doc/source/user_guide/io.rst | 2 +- doc/source/whatsnew/v2.0.0.rst | 4 ++-- pandas/_libs/tslibs/strptime.pyx | 41 ++++++++++++++++++++++---------- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 229f340b854ec..9de15c6341e91 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1011,7 +1011,7 @@ first read it in as an object dtype and then apply :func:`to_datetime` to each e df['date'] = df['date'].apply(pd.to_datetime) df -or, if your datetime formats are all ISO8601: +or, if your datetime formats are all ISO8601 (but possibly not identically-formatted): .. 
ipython:: python diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 707ffc55bd05e..4df66f9330d3b 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -168,7 +168,7 @@ Other enhancements - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`) - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`) - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`) -- :func:`to_datetime` now accepts ``"ISO8601"`` as an argument to ``format``, which will match any ISO8601 string (:issue:`50411`) +- :func:`to_datetime` now accepts ``"ISO8601"`` as an argument to ``format``, which will match any ISO8601 string (but possibly not identically-formatted) (:issue:`50411`) - .. --------------------------------------------------------------------------- @@ -560,7 +560,7 @@ to each element individually, e.g. 
:: ser = pd.Series(['13-01-2000', '12 January 2000']) ser.apply(pd.to_datetime) -or, if your formats are all ISO8601, :: +or, if your formats are all ISO8601 (but possibly not identically-formatted) :: ser = pd.Series(['2020-01-01', '2020-01-01 03:00']) pd.to_datetime(ser, format='ISO8601') diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 1733d6fafd58d..22229f688c04e 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -349,25 +349,40 @@ def array_strptime( if exact: found = format_regex.match(val) if not found: - raise ValueError( - f"time data \"{val}\" doesn't " - f"match {'(inferred) '*fmt_inferred}format \"{fmt}\"" - ) + if fmt_inferred: + raise ValueError( + f"time data \"{val}\" doesn't " + f"match (inferred) format \"{fmt}\"" + ) + else: + raise ValueError( + f"time data \"{val}\" doesn't match format \"{fmt}\"" + ) if len(val) != found.end(): - raise ValueError( - "unconverted data remains when parsing with " - f"{'(inferred) '*fmt_inferred}format \"{fmt}\": " - f'"{val[found.end():]}"' - ) + if fmt_inferred: + raise ValueError( + "unconverted data remains when parsing with " + f"(inferred) format \"{fmt}\": \"{val[found.end():]}\"" + ) + else: + raise ValueError( + "unconverted data remains when parsing with " + f"format \"{fmt}\": \"{val[found.end():]}\"" + ) # search else: found = format_regex.search(val) if not found: - raise ValueError( - f"time data \"{val}\" doesn't match " - f"{'(inferred) '*fmt_inferred}format \"{fmt}\"" - ) + if fmt_inferred: + raise ValueError( + f"time data \"{val}\" doesn't match " + f"(inferred) format \"{fmt}\"" + ) + else: + raise ValueError( + f"time data \"{val}\" doesn't match format \"{fmt}\"" + ) iso_year = -1 year = 1900 From b247bbd2aa04d13ec72c0c5721962f50aee10028 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 23 Jan 2023 17:43:04 +0000 Subject: [PATCH 06/12] note that exact has no effect if format=iso8601 --- 
pandas/core/tools/datetimes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 1895afb91c049..0c13ee3ab2ffb 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -774,6 +774,7 @@ def to_datetime( - If :const:`False`, allow the `format` to match anywhere in the target string. + Note that if ``format='ISO8601'`` then `exact` has no effect. unit : str, default 'ns' The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. From 262be89f5851e0339d258eb16e5705dbaa713560 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 25 Jan 2023 10:32:33 +0000 Subject: [PATCH 07/12] point to format=ISO8601 in error message --- pandas/_libs/tslibs/strptime.pyx | 7 +++++- pandas/tests/tools/test_to_datetime.py | 32 ++++++++++++++++++-------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 22229f688c04e..ce6348b8d4538 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -533,7 +533,12 @@ def array_strptime( result_timezone[i] = tz except (ValueError, OutOfBoundsDatetime) as ex: - ex.args = (f"{str(ex)}, at position {i}",) + if iso_format: + ex.args = (f"{str(ex)}, at position {i}. 
If your time strings " + "are all (not-necessarily-identically-formatted) ISO8601, " + "you could try passing 'format=\"ISO8601\"'",) + else: + ex.args = (f"{str(ex)}, at position {i}",) if is_coerce: iresult[i] = NPY_NAT continue diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 54e94495ddcba..bb0495ba3acaa 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1295,7 +1295,9 @@ def test_datetime_bool_arrays_mixed(self, cache): ValueError, match=( r'^time data "True" doesn\'t match \(inferred\) format "%Y%m%d", ' - "at position 1$" + "at position 1. If your time strings are all " + r"\(not-necessarily-identically-formatted\) ISO8601, you could " + "try passing 'format=\"ISO8601\"'$" ), ): to_datetime(["20130101", True], cache=cache) @@ -2093,7 +2095,9 @@ def test_dataframe_coerce(self, cache): msg = ( r'^cannot assemble the datetimes: time data ".+" doesn\'t ' - r'match format "%Y%m%d", at position 1$' + r'match format "%Y%m%d", at position 1. ' + r"If your time strings are all \(not-necessarily-identically-formatted\) " + "ISO8601, you could try passing 'format=\"ISO8601\"'$" ) with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) @@ -2171,7 +2175,9 @@ def test_dataframe_float(self, cache): df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]}) msg = ( r"^cannot assemble the datetimes: unconverted data remains when parsing " - r'with format ".*": "1", at position 0$' + r'with format ".*": "1", at position 0. ' + r"If your time strings are all \(not-necessarily-identically-formatted\) " + "ISO8601, you could try passing 'format=\"ISO8601\"'$" ) with pytest.raises(ValueError, match=msg): to_datetime(df, cache=cache) @@ -2254,7 +2260,9 @@ def test_to_datetime_iso8601_exact_fails(self, input, format): msg = "|".join( [ '^unconverted data remains when parsing with format ".*": ".*"' - ", at position 0$", + ", at position 0. 
" + r"If your time strings are all \(not-necessarily-identically-" + r"formatted\) ISO8601, you could try passing 'format=\"ISO8601\"'$", 'time data ".*" doesn\'t match format ".*", at position 0', ] ) @@ -2856,7 +2864,9 @@ def test_day_not_in_month_raise(self, cache): ( "2015-02-29", "%Y-%m-%d", - "^day is out of range for month, at position 0$", + "^day is out of range for month, at position 0. " + r"If your time strings are all \(not-necessarily-identically-" + r"formatted\) ISO8601, you could try passing 'format=\"ISO8601\"'$", ), ( "2015-29-02", @@ -2867,7 +2877,9 @@ def test_day_not_in_month_raise(self, cache): "2015-02-32", "%Y-%m-%d", '^unconverted data remains when parsing with format "%Y-%m-%d": "2", ' - "at position 0$", + "at position 0. " + r"If your time strings are all \(not-necessarily-identically-" + r"formatted\) ISO8601, you could try passing 'format=\"ISO8601\"'$", ), ( "2015-32-02", @@ -2878,7 +2890,9 @@ def test_day_not_in_month_raise(self, cache): ( "2015-04-31", "%Y-%m-%d", - "^day is out of range for month, at position 0$", + "^day is out of range for month, at position 0. 
" + r"If your time strings are all \(not-necessarily-identically-" + r"formatted\) ISO8601, you could try passing 'format=\"ISO8601\"'$", ), ( "2015-31-04", @@ -3290,9 +3304,7 @@ def test_incorrect_value_exception(self): ) def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): # see gh-23830 - msg = ( - r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00, at position 0$" - ) + msg = r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00, at position 0" with pytest.raises(OutOfBoundsDatetime, match=msg): with tm.assert_produces_warning(warning, match="Could not infer format"): to_datetime("2417-10-10 00:00:00", format=format) From e01b6eedaacd7fd0c06779d7acc5837db5281617 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 25 Jan 2023 19:54:15 +0000 Subject: [PATCH 08/12] allow format="mixed" --- doc/source/user_guide/io.rst | 8 +- doc/source/whatsnew/v2.0.0.rst | 7 +- pandas/_libs/tslibs/strptime.pyi | 1 - pandas/_libs/tslibs/strptime.pyx | 53 ++++------- pandas/core/tools/datetimes.py | 29 +++--- pandas/tests/io/parser/test_parse_dates.py | 4 +- pandas/tests/tools/test_to_datetime.py | 92 +++++++++++-------- .../0004-consistent-to-datetime-parsing.md | 2 +- 8 files changed, 97 insertions(+), 99 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 9de15c6341e91..123737746bc8b 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1001,17 +1001,17 @@ way to parse dates is to explicitly set ``format=``. ) df -In the case that you have mixed datetime formats within the same column, you'll need to -first read it in as an object dtype and then apply :func:`to_datetime` to each element. +In the case that you have mixed datetime formats within the same column, you can +pass ``format='mixed'`` .. 
ipython:: python data = io.StringIO("date\n12 Jan 2000\n2000-01-13\n") df = pd.read_csv(data) - df['date'] = df['date'].apply(pd.to_datetime) + df['date'] = pd.to_datetime(df['date'], format='mixed') df -or, if your datetime formats are all ISO8601 (but possibly not identically-formatted): +or, if your datetime formats are all ISO8601 (possibly not identically-formatted): .. ipython:: python diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 8fa2ae7a332fb..014399f7b1c4c 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -188,6 +188,7 @@ Other enhancements - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`) - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`) - :func:`to_datetime` now accepts ``"ISO8601"`` as an argument to ``format``, which will match any ISO8601 string (but possibly not identically-formatted) (:issue:`50411`) +- :func:`to_datetime` now accepts ``"mixed"`` as an argument to ``format``, which will infer the format for each element individually (:issue:`50972`) - .. --------------------------------------------------------------------------- @@ -573,11 +574,11 @@ In the past, :func:`to_datetime` guessed the format for each element independent Note that this affects :func:`read_csv` as well. -If you still need to parse dates with inconsistent formats, you'll need to apply :func:`to_datetime` -to each element individually, e.g. 
:: +If you still need to parse dates with inconsistent formats, you can use +``format='mixed'`` (preferably alongside ``dayfirst``) :: ser = pd.Series(['13-01-2000', '12 January 2000']) - ser.apply(pd.to_datetime) + pd.to_datetime(ser, format='mixed', dayfirst=True) or, if your formats are all ISO8601 (but possibly not identically-formatted) :: diff --git a/pandas/_libs/tslibs/strptime.pyi b/pandas/_libs/tslibs/strptime.pyi index 78a40a896b2ff..4565bb7ecf959 100644 --- a/pandas/_libs/tslibs/strptime.pyi +++ b/pandas/_libs/tslibs/strptime.pyi @@ -5,7 +5,6 @@ from pandas._typing import npt def array_strptime( values: npt.NDArray[np.object_], fmt: str | None, - fmt_inferred: bool = ..., exact: bool = ..., errors: str = ..., utc: bool = ..., diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index ce6348b8d4538..bc1a0ba1c95ad 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -152,7 +152,6 @@ cdef dict _parse_code_table = {"y": 0, def array_strptime( ndarray[object] values, str fmt, - bint fmt_inferred=False, bint exact=True, errors="raise", bint utc=False, @@ -349,40 +348,22 @@ def array_strptime( if exact: found = format_regex.match(val) if not found: - if fmt_inferred: - raise ValueError( - f"time data \"{val}\" doesn't " - f"match (inferred) format \"{fmt}\"" - ) - else: - raise ValueError( - f"time data \"{val}\" doesn't match format \"{fmt}\"" - ) + raise ValueError( + f"time data \"{val}\" doesn't match format \"{fmt}\"" + ) if len(val) != found.end(): - if fmt_inferred: - raise ValueError( - "unconverted data remains when parsing with " - f"(inferred) format \"{fmt}\": \"{val[found.end():]}\"" - ) - else: - raise ValueError( - "unconverted data remains when parsing with " - f"format \"{fmt}\": \"{val[found.end():]}\"" - ) + raise ValueError( + "unconverted data remains when parsing with " + f"format \"{fmt}\": \"{val[found.end():]}\"" + ) # search else: found = format_regex.search(val) if not 
found: - if fmt_inferred: - raise ValueError( - f"time data \"{val}\" doesn't match " - f"(inferred) format \"{fmt}\"" - ) - else: - raise ValueError( - f"time data \"{val}\" doesn't match format \"{fmt}\"" - ) + raise ValueError( + f"time data \"{val}\" doesn't match format \"{fmt}\"" + ) iso_year = -1 year = 1900 @@ -533,12 +514,14 @@ def array_strptime( result_timezone[i] = tz except (ValueError, OutOfBoundsDatetime) as ex: - if iso_format: - ex.args = (f"{str(ex)}, at position {i}. If your time strings " - "are all (not-necessarily-identically-formatted) ISO8601, " - "you could try passing 'format=\"ISO8601\"'",) - else: - ex.args = (f"{str(ex)}, at position {i}",) + ex.args = ( + f"{str(ex)}, at position {i}. You might want to try:\n" + " - passing ``format='ISO8601'`` if your strings are " + "all ISO8601 but not necessarily in exactly the same format;\n" + " - passing ``format='mixed'``, and the format will be " + "inferred for each element individually. " + "You might want to use ``dayfirst`` alongside this.", + ) if is_coerce: iresult[i] = NPY_NAT continue diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 0c13ee3ab2ffb..4fc84453962c5 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -442,15 +442,12 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) - format_inferred = False - if format is None: + if format is None and format != "mixed": format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - format_inferred = True - if format is not None: - return _array_strptime_with_fallback( - arg, name, utc, format, format_inferred, exact, errors - ) + # `format` could not be inferred, or user asked for mixed-format parsing. 
+ if format is not None and format != "mixed": + return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) result, tz_parsed = objects_to_datetime64ns( arg, @@ -475,16 +472,13 @@ def _array_strptime_with_fallback( name, utc: bool, fmt: str, - fmt_inferred: bool, exact: bool, errors: str, ) -> Index: """ Call array_strptime, with fallback behavior depending on 'errors'. """ - result, timezones = array_strptime( - arg, fmt, fmt_inferred=fmt_inferred, exact=exact, errors=errors, utc=utc - ) + result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc) if any(tz is not None for tz in timezones): return _return_parsed_timezone_results(result, timezones, utc, name) @@ -694,7 +688,7 @@ def to_datetime( yearfirst: bool = False, utc: bool = False, format: str | None = None, - exact: bool = True, + exact: bool | lib.NoDefault = lib.no_default, unit: str | None = None, infer_datetime_format: lib.NoDefault | bool = lib.no_default, origin: str = "unix", @@ -766,7 +760,12 @@ def to_datetime( `_ for more information on choices, though note that :const:`"%f"` will parse all the way up to nanoseconds. - You can also pass "ISO8601" to parse any ISO8601 time string. + You can also pass: + + - "ISO8601", to parse any ISO8601 time string (not necessarily in exactly the + same format); + - "mixed", to infer the format for each element individually. This is risky, + and you should probably use it along with `dayfirst`. exact : bool, default True Control how `format` is used: @@ -774,7 +773,7 @@ def to_datetime( - If :const:`False`, allow the `format` to match anywhere in the target string. - Note that if ``format='ISO8601'`` then `exact` has no effect. + Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``. unit : str, default 'ns' The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. 
@@ -1006,6 +1005,8 @@ def to_datetime( DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) """ + if exact is not lib.no_default and format in {"mixed", "ISO8601"}: + raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") if infer_datetime_format is not lib.no_default: warnings.warn( "The argument 'infer_datetime_format' is deprecated and will " diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index d7f21378309d2..b4e4164e4e7c8 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1719,8 +1719,8 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): with pytest.raises( ValueError, match=( - r'^time data "31/05/2000" doesn\'t match \(inferred\) format "%m/%d/%Y", ' - r"at position 1$" + r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", ' + r"at position 1. You might want to try:\n - passing ``format='ISO8601'``" ), ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index bb0495ba3acaa..cb049c6755158 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -48,6 +48,15 @@ from pandas.core.tools.datetimes import start_caching_at from pandas.util.version import Version +PARSING_ERR_MSG = ( + r"You might want to try:\n" + r" - passing ``format=\'ISO8601\'`` if your strings are all ISO8601 " + r"but not necessarily in exactly the same format;\n" + r" - passing ``format=\'mixed\'``, and the format will be inferred " + r"for each element individually. You might want to use ``dayfirst`` " + r"alongside this." 
+) + @pytest.fixture(params=[True, False]) def cache(request): @@ -531,9 +540,10 @@ def test_to_datetime_parse_timezone_malformed(self, offset): msg = "|".join( [ - r'^time data ".*" doesn\'t match format ".*", at position 0$', + r'^time data ".*" doesn\'t match format ".*", at position 0. ' + f"{PARSING_ERR_MSG}$", r'^unconverted data remains when parsing with format ".*": ".*", ' - "at position 0$", + f"at position 0. {PARSING_ERR_MSG}$", ] ) with pytest.raises(ValueError, match=msg): @@ -1294,10 +1304,8 @@ def test_datetime_bool_arrays_mixed(self, cache): with pytest.raises( ValueError, match=( - r'^time data "True" doesn\'t match \(inferred\) format "%Y%m%d", ' - "at position 1. If your time strings are all " - r"\(not-necessarily-identically-formatted\) ISO8601, you could " - "try passing 'format=\"ISO8601\"'$" + r'^time data "True" doesn\'t match format "%Y%m%d", ' + f"at position 1. {PARSING_ERR_MSG}$" ), ): to_datetime(["20130101", True], cache=cache) @@ -1339,10 +1347,11 @@ def test_datetime_invalid_scalar(self, value, format, warning): msg = "|".join( [ - r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0$', + r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0. ' + f"{PARSING_ERR_MSG}$", r'^Given date string "a" not likely a datetime, at position 0$', r'^unconverted data remains when parsing with format "%H:%M:%S": "9", ' - "at position 0$", + f"at position 0. {PARSING_ERR_MSG}$", r"^second must be in 0..59: 00:01:99, at position 0$", ] ) @@ -1365,7 +1374,7 @@ def test_datetime_outofbounds_scalar(self, value, format, warning): assert res is NaT if format is not None: - msg = r'^time data ".*" doesn\'t match format ".*", at position 0$' + msg = r'^time data ".*" doesn\'t match format ".*", at position 0.' 
with pytest.raises(ValueError, match=msg): to_datetime(value, errors="raise", format=format) else: @@ -1392,9 +1401,10 @@ def test_datetime_invalid_index(self, values, format, warning): msg = "|".join( [ r'^Given date string "a" not likely a datetime, at position 0$', - r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0$', + r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0. ' + f"{PARSING_ERR_MSG}$", r'^unconverted data remains when parsing with format "%H:%M:%S": "9", ' - "at position 0$", + f"at position 0. {PARSING_ERR_MSG}$", r"^second must be in 0..59: 00:01:99, at position 0$", ] ) @@ -2095,9 +2105,7 @@ def test_dataframe_coerce(self, cache): msg = ( r'^cannot assemble the datetimes: time data ".+" doesn\'t ' - r'match format "%Y%m%d", at position 1. ' - r"If your time strings are all \(not-necessarily-identically-formatted\) " - "ISO8601, you could try passing 'format=\"ISO8601\"'$" + r'match format "%Y%m%d", at position 1.' ) with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) @@ -2175,9 +2183,7 @@ def test_dataframe_float(self, cache): df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]}) msg = ( r"^cannot assemble the datetimes: unconverted data remains when parsing " - r'with format ".*": "1", at position 0. ' - r"If your time strings are all \(not-necessarily-identically-formatted\) " - "ISO8601, you could try passing 'format=\"ISO8601\"'$" + r'with format ".*": "1", at position 0.' ) with pytest.raises(ValueError, match=msg): to_datetime(df, cache=cache) @@ -2260,10 +2266,9 @@ def test_to_datetime_iso8601_exact_fails(self, input, format): msg = "|".join( [ '^unconverted data remains when parsing with format ".*": ".*"' - ", at position 0. " - r"If your time strings are all \(not-necessarily-identically-" - r"formatted\) ISO8601, you could try passing 'format=\"ISO8601\"'$", - 'time data ".*" doesn\'t match format ".*", at position 0', + f", at position 0. 
{PARSING_ERR_MSG}$", + f'^time data ".*" doesn\'t match format ".*", at position 0. ' + f"{PARSING_ERR_MSG}$", ] ) with pytest.raises( @@ -2397,8 +2402,8 @@ def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) msg = ( - r'^time data " " doesn\'t match \(inferred\) format "%m/%d/%Y", ' - "at position 2$" + r'^time data " " doesn\'t match format "%m/%d/%Y", ' + rf"at position 2. {PARSING_ERR_MSG}$" ) with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) @@ -2663,8 +2668,8 @@ def test_dayfirst_warnings_invalid_input(self): with pytest.raises( ValueError, match=( - r'^time data "03/30/2011" doesn\'t match \(inferred\) format ' - r'"%d/%m/%Y", at position 1$' + r'^time data "03/30/2011" doesn\'t match format ' + rf'"%d/%m/%Y", at position 1. {PARSING_ERR_MSG}$' ), ): to_datetime(arr, dayfirst=True) @@ -2734,8 +2739,8 @@ def test_to_datetime_inconsistent_format(self, cache): data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] ser = Series(np.array(data)) msg = ( - r'^time data "01-02-2011 00:00:00" doesn\'t match \(inferred\) format ' - r'"%m/%d/%Y %H:%M:%S", at position 1$' + r'^time data "01-02-2011 00:00:00" doesn\'t match format ' + rf'"%m/%d/%Y %H:%M:%S", at position 1. {PARSING_ERR_MSG}$' ) with pytest.raises(ValueError, match=msg): to_datetime(ser, cache=cache) @@ -2864,40 +2869,34 @@ def test_day_not_in_month_raise(self, cache): ( "2015-02-29", "%Y-%m-%d", - "^day is out of range for month, at position 0. " - r"If your time strings are all \(not-necessarily-identically-" - r"formatted\) ISO8601, you could try passing 'format=\"ISO8601\"'$", + f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", ), ( "2015-29-02", "%Y-%d-%m", - "^day is out of range for month, at position 0$", + f"^day is out of range for month, at position 0. 
{PARSING_ERR_MSG}$", ), ( "2015-02-32", "%Y-%m-%d", '^unconverted data remains when parsing with format "%Y-%m-%d": "2", ' - "at position 0. " - r"If your time strings are all \(not-necessarily-identically-" - r"formatted\) ISO8601, you could try passing 'format=\"ISO8601\"'$", + f"at position 0. {PARSING_ERR_MSG}$", ), ( "2015-32-02", "%Y-%d-%m", '^time data "2015-32-02" doesn\'t match format "%Y-%d-%m", ' - "at position 0$", + f"at position 0. {PARSING_ERR_MSG}$", ), ( "2015-04-31", "%Y-%m-%d", - "^day is out of range for month, at position 0. " - r"If your time strings are all \(not-necessarily-identically-" - r"formatted\) ISO8601, you could try passing 'format=\"ISO8601\"'$", + f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", ), ( "2015-31-04", "%Y-%d-%m", - "^day is out of range for month, at position 0$", + f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", ), ], ) @@ -3543,6 +3542,21 @@ def test_to_datetime_mixed_iso8601(): tm.assert_index_equal(result, expected) +def test_to_datetime_mixed_other(): + # https://github.com/pandas-dev/pandas/issues/50411 + result = to_datetime(["01/11/2000", "12 January 2000"], format="mixed") + expected = DatetimeIndex(["2000-01-11", "2000-01-12"]) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("exact", [True, False]) +@pytest.mark.parametrize("format", ["ISO8601", "mixed"]) +def test_to_datetime_mixed_or_iso_exact(exact, format): + msg = "Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'" + with pytest.raises(ValueError, match=msg): + to_datetime(["2020-01-01"], exact=exact, format=format) + + def test_to_datetime_mixed_not_necessarily_iso8601_raise(): # https://github.com/pandas-dev/pandas/issues/50411 with pytest.raises( diff --git a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md index 3ad58985ef727..8404aff51d851 100644 --- a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md +++ 
b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md @@ -104,4 +104,4 @@ We could make ``guess_datetime_format`` smarter by using a random sample of elem ### PDEP History - 18 September 2022: Initial draft -- 23 January 2023: Amended to mention ``format='ISO8601'`` option +- 25 January 2023: Amended to mention ``format='ISO8601'`` and ``format='mixed'`` options From 607c77d0a6e6f2a89eb2f981d478fc03dbd1b68a Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 1 Feb 2023 19:23:36 +0000 Subject: [PATCH 09/12] link to iso wiki page --- pandas/core/tools/datetimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 4fc84453962c5..e90b426e54a42 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -762,8 +762,8 @@ def to_datetime( note that :const:`"%f"` will parse all the way up to nanoseconds. You can also pass: - - "ISO8601", to parse any ISO8601 time string (not necessarily in exactly the - same format); + - "ISO8601", to parse any `ISO8601 `_ + time string (not necessarily in exactly the same format); - "mixed", to infer the format for each element individually. This is risky, and you should probably use it along with `dayfirst`. 
exact : bool, default True From 313003ed0ec405fca8e56cf2a34ddd247a9ebe81 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 10 Feb 2023 09:04:27 +0000 Subject: [PATCH 10/12] minor fixups --- pandas/core/tools/datetimes.py | 4 ++-- pandas/tests/tools/test_to_datetime.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 598c84af8c91d..4b4257e7aad6d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -442,10 +442,10 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) - if format is None and format != "mixed": + if format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - # `format` could not be inferred, or user asked for mixed-format parsing. + # `format` could be inferred, or user didn't ask for mixed-format parsing. if format is not None and format != "mixed": return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 5d8241ebca333..8ce04125130d5 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2114,7 +2114,7 @@ def test_dataframe_coerce(self, cache): msg = ( r'^cannot assemble the datetimes: time data ".+" doesn\'t ' - r'match format "%Y%m%d", at position 1.' + r'match format "%Y%m%d", at position 1\.' 
) with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) From 3b61e5bdf3065868b42fee91a2c6bad8171ddcb6 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 10 Feb 2023 14:24:20 +0000 Subject: [PATCH 11/12] double backticks -> single, suggest passing format --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/_libs/tslibs/strptime.pyx | 7 ++++--- pandas/core/tools/datetimes.py | 4 +--- pandas/tests/io/parser/test_parse_dates.py | 2 +- pandas/tests/tools/test_to_datetime.py | 7 ++++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index bc52f811143a8..9eacb8148adcb 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -734,7 +734,7 @@ In the past, :func:`to_datetime` guessed the format for each element independent Note that this affects :func:`read_csv` as well. If you still need to parse dates with inconsistent formats, you can use -``format='mixed`` (preferably alongside ``dayfirst``) :: +``format='mixed'`` (possibly alongside ``dayfirst``) :: ser = pd.Series(['13-01-2000', '12 January 2000']) pd.to_datetime(ser, format='mixed', dayfirst=True) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 539e458cd92cb..cf847746f16cd 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -516,11 +516,12 @@ def array_strptime( except (ValueError, OutOfBoundsDatetime) as ex: ex.args = ( f"{str(ex)}, at position {i}. You might want to try:\n" - " - passing ``format='ISO8601'`` if your strings are " + " - passing `format` if your strings have a consistent format;\n" + " - passing `format='ISO8601'` if your strings are " "all ISO8601 but not necessarily in exactly the same format;\n" - " - passing ``format='mixed'``, and the format will be " + " - passing `format='mixed'`, and the format will be " "inferred for each element individually. 
" - "You might want to use ``dayfirst`` alongside this.", + "You might want to use `dayfirst` alongside this.", ) if is_coerce: iresult[i] = NPY_NAT diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 4b4257e7aad6d..e1a5faf49d1bf 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -718,9 +718,7 @@ def to_datetime( .. warning:: ``dayfirst=True`` is not strict, but will prefer to parse - with day first. If a delimited date string cannot be parsed in - accordance with the given `dayfirst` option, e.g. - ``to_datetime(['31-12-2021'])``, then a warning will be shown. + with day first. yearfirst : bool, default False Specify a date parse order if `arg` is str or is list-like. diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index dbdfd64215d76..edae696b84bf4 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1722,7 +1722,7 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): ValueError, match=( r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", ' - r"at position 1. You might want to try:\n - passing ``format='ISO8601'``" + r"at position 1. 
You might want to try:" ), ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 8ce04125130d5..5a929fd0e9b21 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -50,10 +50,11 @@ PARSING_ERR_MSG = ( r"You might want to try:\n" - r" - passing ``format=\'ISO8601\'`` if your strings are all ISO8601 " + r" - passing `format` if your strings have a consistent format;\n" + r" - passing `format=\'ISO8601\'` if your strings are all ISO8601 " r"but not necessarily in exactly the same format;\n" - r" - passing ``format=\'mixed\'``, and the format will be inferred " - r"for each element individually. You might want to use ``dayfirst`` " + r" - passing `format=\'mixed\'`, and the format will be inferred " + r"for each element individually. You might want to use `dayfirst` " r"alongside this." ) From acd44ae06b0ef7abb04b5a1533c3e61e920d2a04 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 10 Feb 2023 14:25:39 +0000 Subject: [PATCH 12/12] use format=mixed instead of apply in example; --- web/pandas/pdeps/0004-consistent-to-datetime-parsing.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md index 8404aff51d851..3a020aa736a5e 100644 --- a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md +++ b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md @@ -58,11 +58,8 @@ Concretely, the suggestion is: If a user has dates in a mixed format, they can still use flexible parsing and accept the risks that poses, e.g.: ```ipython -In [3]: pd.Series(['12-01-2000 00:00:00', '13-01-2000 00:00:00']).apply(pd.to_datetime) -Out[3]: -0 2000-12-01 -1 2000-01-13 -dtype: datetime64[ns] +In [3]: pd.to_datetime(['12-01-2000 00:00:00', '13-01-2000 00:00:00'], format='mixed') +Out[3]: 
DatetimeIndex(['2000-12-01', '2000-01-13'], dtype='datetime64[ns]', freq=None) ``` or, if their dates are all ISO8601, ```ipython