diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index ec082cb90e75c..3b06fa1b5517a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1001,14 +1001,23 @@ way to parse dates is to explicitly set ``format=``. ) df -In the case that you have mixed datetime formats within the same column, you'll need to -first read it in as an object dtype and then apply :func:`to_datetime` to each element. +In the case that you have mixed datetime formats within the same column, you can +pass ``format='mixed'`` .. ipython:: python data = io.StringIO("date\n12 Jan 2000\n2000-01-13\n") df = pd.read_csv(data) - df['date'] = df['date'].apply(pd.to_datetime) + df['date'] = pd.to_datetime(df['date'], format='mixed') + df + +or, if your datetime formats are all ISO8601 (possibly not identically-formatted): + +.. ipython:: python + + data = io.StringIO("date\n2020-01-01\n2020-01-01 03:00\n") + df = pd.read_csv(data) + df['date'] = pd.to_datetime(df['date'], format='ISO8601') df .. 
ipython:: python diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d1b965e64e43b..d009225f06018 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -311,6 +311,8 @@ Other enhancements - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`) - Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`) - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`) +- :func:`to_datetime` now accepts ``"ISO8601"`` as an argument to ``format``, which will match any ISO8601 string (but possibly not identically-formatted) (:issue:`50411`) +- :func:`to_datetime` now accepts ``"mixed"`` as an argument to ``format``, which will infer the format for each element individually (:issue:`50972`) - Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`) - Added support for SQLAlchemy 2.0 (:issue:`40686`) - :class:`Index` set operations :meth:`Index.union`, :meth:`Index.intersection`, :meth:`Index.difference`, and :meth:`Index.symmetric_difference` now support ``sort=True``, which will always return a sorted result, unlike the default ``sort=None`` which does not sort in some cases (:issue:`25151`) @@ -738,11 +740,16 @@ In the past, :func:`to_datetime` guessed the format for each element independent Note that this affects :func:`read_csv` as well. -If you still need to parse dates with inconsistent formats, you'll need to apply :func:`to_datetime` -to each element individually, e.g. 
:: +If you still need to parse dates with inconsistent formats, you can use +``format='mixed'`` (possibly alongside ``dayfirst``) :: ser = pd.Series(['13-01-2000', '12 January 2000']) - ser.apply(pd.to_datetime) + pd.to_datetime(ser, format='mixed', dayfirst=True) + +or, if your formats are all ISO8601 (but possibly not identically-formatted) :: + + ser = pd.Series(['2020-01-01', '2020-01-01 03:00']) + pd.to_datetime(ser, format='ISO8601') .. _whatsnew_200.api_breaking.other: diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index bb06c65597987..cf847746f16cd 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -186,6 +186,7 @@ def array_strptime( bint iso_format = format_is_iso(fmt) NPY_DATETIMEUNIT out_bestunit int out_local = 0, out_tzoffset = 0 + bint string_to_dts_succeeded = 0 assert is_raise or is_ignore or is_coerce @@ -306,44 +307,54 @@ def array_strptime( else: val = str(val) - if iso_format: - string_to_dts_failed = string_to_dts( + if fmt == "ISO8601": + string_to_dts_succeeded = not string_to_dts( + val, &dts, &out_bestunit, &out_local, + &out_tzoffset, False, None, False + ) + elif iso_format: + string_to_dts_succeeded = not string_to_dts( val, &dts, &out_bestunit, &out_local, &out_tzoffset, False, fmt, exact ) - if not string_to_dts_failed: - # No error reported by string_to_dts, pick back up - # where we left off - value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) - if out_local == 1: - # Store the out_tzoffset in seconds - # since we store the total_seconds of - # dateutil.tz.tzoffset objects - tz = timezone(timedelta(minutes=out_tzoffset)) - result_timezone[i] = tz - out_local = 0 - out_tzoffset = 0 - iresult[i] = value - check_dts_bounds(&dts) - continue + if string_to_dts_succeeded: + # No error reported by string_to_dts, pick back up + # where we left off + value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) + if out_local == 1: + # Store the out_tzoffset in seconds + # 
since we store the total_seconds of + # dateutil.tz.tzoffset objects + tz = timezone(timedelta(minutes=out_tzoffset)) + result_timezone[i] = tz + out_local = 0 + out_tzoffset = 0 + iresult[i] = value + check_dts_bounds(&dts) + continue if parse_today_now(val, &iresult[i], utc): continue # Some ISO formats can't be parsed by string_to_dts - # For example, 6-digit YYYYMD. So, if there's an error, - # try the string-matching code below. + # For example, 6-digit YYYYMD. So, if there's an error, and a format + # was specified, then try the string-matching code below. If the format + # specified was 'ISO8601', then we need to error, because + # only string_to_dts handles mixed ISO8601 formats. + if not string_to_dts_succeeded and fmt == "ISO8601": + raise ValueError(f"Time data {val} is not ISO8601 format") # exact matching if exact: found = format_regex.match(val) if not found: - raise ValueError(f"time data \"{val}\" doesn't " - f"match format \"{fmt}\"") + raise ValueError( + f"time data \"{val}\" doesn't match format \"{fmt}\"" + ) if len(val) != found.end(): raise ValueError( - f"unconverted data remains: " - f'"{val[found.end():]}"' + "unconverted data remains when parsing with " + f"format \"{fmt}\": \"{val[found.end():]}\"" ) # search @@ -351,8 +362,7 @@ def array_strptime( found = format_regex.search(val) if not found: raise ValueError( - f"time data \"{val}\" doesn't match " - f"format \"{fmt}\"" + f"time data \"{val}\" doesn't match format \"{fmt}\"" ) iso_year = -1 @@ -504,7 +514,15 @@ def array_strptime( result_timezone[i] = tz except (ValueError, OutOfBoundsDatetime) as ex: - ex.args = (f"{str(ex)}, at position {i}",) + ex.args = ( + f"{str(ex)}, at position {i}. 
You might want to try:\n" + " - passing `format` if your strings have a consistent format;\n" + " - passing `format='ISO8601'` if your strings are " + "all ISO8601 but not necessarily in exactly the same format;\n" + " - passing `format='mixed'`, and the format will be " + "inferred for each element individually. " + "You might want to use `dayfirst` alongside this.", + ) if is_coerce: iresult[i] = NPY_NAT continue diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 3006bc6290ff7..b917f2de61343 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -445,7 +445,8 @@ def _convert_listlike_datetimes( if format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - if format is not None: + # `format` could be inferred, or user didn't ask for mixed-format parsing. + if format is not None and format != "mixed": return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) result, tz_parsed = objects_to_datetime64ns( @@ -687,7 +688,7 @@ def to_datetime( yearfirst: bool = False, utc: bool = False, format: str | None = None, - exact: bool = True, + exact: bool | lib.NoDefault = lib.no_default, unit: str | None = None, infer_datetime_format: lib.NoDefault | bool = lib.no_default, origin: str = "unix", @@ -717,9 +718,7 @@ def to_datetime( .. warning:: ``dayfirst=True`` is not strict, but will prefer to parse - with day first. If a delimited date string cannot be parsed in - accordance with the given `dayfirst` option, e.g. - ``to_datetime(['31-12-2021'])``, then a warning will be shown. + with day first. yearfirst : bool, default False Specify a date parse order if `arg` is str or is list-like. @@ -759,6 +758,12 @@ def to_datetime( `_ for more information on choices, though note that :const:`"%f"` will parse all the way up to nanoseconds. 
+ You can also pass: + + - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_ + time string (not necessarily in exactly the same format); + - "mixed", to infer the format for each element individually. This is risky, + and you should probably use it along with `dayfirst`. exact : bool, default True Control how `format` is used: @@ -766,6 +771,7 @@ def to_datetime( - If :const:`False`, allow the `format` to match anywhere in the target string. + Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``. unit : str, default 'ns' The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. @@ -997,6 +1003,8 @@ def to_datetime( DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) """ + if exact is not lib.no_default and format in {"mixed", "ISO8601"}: + raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") if infer_datetime_format is not lib.no_default: warnings.warn( "The argument 'infer_datetime_format' is deprecated and will " diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 09a2967d62fee..edae696b84bf4 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1721,7 +1721,8 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): with pytest.raises( ValueError, match=( - r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", at position 1$' + r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", ' + r"at position 1. 
You might want to try:" ), ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index afe237d35076c..71f2cae49fe41 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -48,6 +48,16 @@ from pandas.core.tools.datetimes import start_caching_at from pandas.util.version import Version +PARSING_ERR_MSG = ( + r"You might want to try:\n" + r" - passing `format` if your strings have a consistent format;\n" + r" - passing `format=\'ISO8601\'` if your strings are all ISO8601 " + r"but not necessarily in exactly the same format;\n" + r" - passing `format=\'mixed\'`, and the format will be inferred " + r"for each element individually. You might want to use `dayfirst` " + r"alongside this." +) + @pytest.fixture(params=[True, False]) def cache(request): @@ -133,7 +143,11 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): ser2 = ser.apply(str) ser2[2] = "nat" with pytest.raises( - ValueError, match='unconverted data remains: ".0", at position 0' + ValueError, + match=( + 'unconverted data remains when parsing with format "%Y%m%d": ".0", ' + "at position 0" + ), ): # https://github.com/pandas-dev/pandas/issues/50051 to_datetime(ser2, format="%Y%m%d", cache=cache) @@ -527,8 +541,10 @@ def test_to_datetime_parse_timezone_malformed(self, offset): msg = "|".join( [ - r'^time data ".*" doesn\'t match format ".*", at position 0$', - r'^unconverted data remains: ".*", at position 0$', + r'^time data ".*" doesn\'t match format ".*", at position 0. ' + f"{PARSING_ERR_MSG}$", + r'^unconverted data remains when parsing with format ".*": ".*", ' + f"at position 0. 
{PARSING_ERR_MSG}$", ] ) with pytest.raises(ValueError, match=msg): @@ -1294,7 +1310,10 @@ def test_datetime_bool_arrays_mixed(self, cache): to_datetime([False, datetime.today()], cache=cache) with pytest.raises( ValueError, - match=r'^time data "True" doesn\'t match format "%Y%m%d", at position 1$', + match=( + r'^time data "True" doesn\'t match format "%Y%m%d", ' + f"at position 1. {PARSING_ERR_MSG}$" + ), ): to_datetime(["20130101", True], cache=cache) tm.assert_index_equal( @@ -1335,9 +1354,11 @@ def test_datetime_invalid_scalar(self, value, format, warning): msg = "|".join( [ - r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0$', + r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0. ' + f"{PARSING_ERR_MSG}$", r'^Given date string "a" not likely a datetime, at position 0$', - r'^unconverted data remains: "9", at position 0$', + r'^unconverted data remains when parsing with format "%H:%M:%S": "9", ' + f"at position 0. {PARSING_ERR_MSG}$", r"^second must be in 0..59: 00:01:99, at position 0$", ] ) @@ -1360,7 +1381,7 @@ def test_datetime_outofbounds_scalar(self, value, format, warning): assert res is NaT if format is not None: - msg = r'^time data ".*" doesn\'t match format ".*", at position 0$' + msg = r'^time data ".*" doesn\'t match format ".*", at position 0.' with pytest.raises(ValueError, match=msg): to_datetime(value, errors="raise", format=format) else: @@ -1387,8 +1408,10 @@ def test_datetime_invalid_index(self, values, format, warning): msg = "|".join( [ r'^Given date string "a" not likely a datetime, at position 0$', - r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0$', - r'^unconverted data remains: "9", at position 0$', + r'^time data "a" doesn\'t match format "%H:%M:%S", at position 0. ' + f"{PARSING_ERR_MSG}$", + r'^unconverted data remains when parsing with format "%H:%M:%S": "9", ' + f"at position 0. 
{PARSING_ERR_MSG}$", r"^second must be in 0..59: 00:01:99, at position 0$", ] ) @@ -2092,7 +2115,7 @@ def test_dataframe_coerce(self, cache): msg = ( r'^cannot assemble the datetimes: time data ".+" doesn\'t ' - r'match format "%Y%m%d", at position 1$' + r'match format "%Y%m%d", at position 1\.' ) with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) @@ -2169,8 +2192,8 @@ def test_dataframe_float(self, cache): # float df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]}) msg = ( - r"^cannot assemble the datetimes: unconverted data remains: " - r'"1", at position 0$' + r"^cannot assemble the datetimes: unconverted data remains when parsing " + r'with format ".*": "1", at position 0.' ) with pytest.raises(ValueError, match=msg): to_datetime(df, cache=cache) @@ -2252,8 +2275,10 @@ def test_to_datetime_iso8601_exact_fails(self, input, format): # `format` is shorter than the date string, so only fails with `exact=True` msg = "|".join( [ - '^unconverted data remains: ".*", at position 0$', - 'time data ".*" doesn\'t match format ".*", at position 0', + '^unconverted data remains when parsing with format ".*": ".*"' + f", at position 0. {PARSING_ERR_MSG}$", + f'^time data ".*" doesn\'t match format ".*", at position 0. ' + f"{PARSING_ERR_MSG}$", ] ) with pytest.raises( @@ -2386,7 +2411,10 @@ def test_to_datetime_on_datetime64_series(self, cache): def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) - msg = r'^time data " " doesn\'t match format "%m/%d/%Y", at position 2$' + msg = ( + r'^time data " " doesn\'t match format "%m/%d/%Y", ' + rf"at position 2. 
{PARSING_ERR_MSG}$" + ) with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) @@ -2661,7 +2689,7 @@ def test_dayfirst_warnings_invalid_input(self): ValueError, match=( r'^time data "03/30/2011" doesn\'t match format ' - r'"%d/%m/%Y", at position 1$' + rf'"%d/%m/%Y", at position 1. {PARSING_ERR_MSG}$' ), ): to_datetime(arr, dayfirst=True) @@ -2732,7 +2760,7 @@ def test_to_datetime_inconsistent_format(self, cache): ser = Series(np.array(data)) msg = ( r'^time data "01-02-2011 00:00:00" doesn\'t match format ' - r'"%m/%d/%Y %H:%M:%S", at position 1$' + rf'"%m/%d/%Y %H:%M:%S", at position 1. {PARSING_ERR_MSG}$' ) with pytest.raises(ValueError, match=msg): to_datetime(ser, cache=cache) @@ -2861,33 +2889,34 @@ def test_day_not_in_month_raise(self, cache): ( "2015-02-29", "%Y-%m-%d", - "^day is out of range for month, at position 0$", + f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", ), ( "2015-29-02", "%Y-%d-%m", - "^day is out of range for month, at position 0$", + f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", ), ( "2015-02-32", "%Y-%m-%d", - '^unconverted data remains: "2", at position 0$', + '^unconverted data remains when parsing with format "%Y-%m-%d": "2", ' + f"at position 0. {PARSING_ERR_MSG}$", ), ( "2015-32-02", "%Y-%d-%m", '^time data "2015-32-02" doesn\'t match format "%Y-%d-%m", ' - "at position 0$", + f"at position 0. {PARSING_ERR_MSG}$", ), ( "2015-04-31", "%Y-%m-%d", - "^day is out of range for month, at position 0$", + f"^day is out of range for month, at position 0. {PARSING_ERR_MSG}$", ), ( "2015-31-04", "%Y-%d-%m", - "^day is out of range for month, at position 0$", + f"^day is out of range for month, at position 0. 
{PARSING_ERR_MSG}$", ), ], ) @@ -3304,9 +3333,7 @@ def test_incorrect_value_exception(self): ) def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): # see gh-23830 - msg = ( - r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00, at position 0$" - ) + msg = r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00, at position 0" with pytest.raises(OutOfBoundsDatetime, match=msg): with tm.assert_produces_warning(warning, match="Could not infer format"): to_datetime("2417-10-10 00:00:00", format=format) @@ -3536,3 +3563,46 @@ def test_to_datetime_format_f_parse_nanos(): nanosecond=789, ) assert result == expected + + +def test_to_datetime_mixed_iso8601(): + # https://github.com/pandas-dev/pandas/issues/50411 + result = to_datetime(["2020-01-01", "2020-01-01 05:00:00"], format="ISO8601") + expected = DatetimeIndex(["2020-01-01 00:00:00", "2020-01-01 05:00:00"]) + tm.assert_index_equal(result, expected) + + +def test_to_datetime_mixed_other(): + # https://github.com/pandas-dev/pandas/issues/50411 + result = to_datetime(["01/11/2000", "12 January 2000"], format="mixed") + expected = DatetimeIndex(["2000-01-11", "2000-01-12"]) + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("exact", [True, False]) +@pytest.mark.parametrize("format", ["ISO8601", "mixed"]) +def test_to_datetime_mixed_or_iso_exact(exact, format): + msg = "Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'" + with pytest.raises(ValueError, match=msg): + to_datetime(["2020-01-01"], exact=exact, format=format) + + +def test_to_datetime_mixed_not_necessarily_iso8601_raise(): + # https://github.com/pandas-dev/pandas/issues/50411 + with pytest.raises( + ValueError, match="Time data 01-01-2000 is not ISO8601 format, at position 1" + ): + to_datetime(["2020-01-01", "01-01-2000"], format="ISO8601") + + +@pytest.mark.parametrize( + ("errors", "expected"), + [ + ("coerce", DatetimeIndex(["2020-01-01 00:00:00", NaT])), + ("ignore", Index(["2020-01-01", 
"01-01-2000"])), + ], +) +def test_to_datetime_mixed_not_necessarily_iso8601_coerce(errors, expected): + # https://github.com/pandas-dev/pandas/issues/50411 + result = to_datetime(["2020-01-01", "01-01-2000"], format="ISO8601", errors=errors) + tm.assert_index_equal(result, expected) diff --git a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md index 7635fabe2dbc6..3a020aa736a5e 100644 --- a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md +++ b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md @@ -4,7 +4,7 @@ - Status: Accepted - Discussion: [#48621](https://github.com/pandas-dev/pandas/pull/48621) - Author: [Marco Gorelli](https://github.com/MarcoGorelli) -- Revision: 1 +- Revision: 2 ## Abstract @@ -58,11 +58,13 @@ Concretely, the suggestion is: If a user has dates in a mixed format, they can still use flexible parsing and accept the risks that poses, e.g.: ```ipython -In [3]: pd.Series(['12-01-2000 00:00:00', '13-01-2000 00:00:00']).apply(pd.to_datetime) -Out[3]: -0 2000-12-01 -1 2000-01-13 -dtype: datetime64[ns] +In [3]: pd.to_datetime(['12-01-2000 00:00:00', '13-01-2000 00:00:00'], format='mixed') +Out[3]: DatetimeIndex(['2000-12-01', '2000-01-13'], dtype='datetime64[ns]', freq=None) +``` +or, if their dates are all ISO8601, +```ipython +In [4]: pd.to_datetime(['2020-01-01', '2020-01-01 03:00'], format='ISO8601') +Out[4]: DatetimeIndex(['2020-01-01 00:00:00', '2020-01-01 03:00:00'], dtype='datetime64[ns]', freq=None) ``` ## Usage and Impact @@ -99,3 +101,4 @@ We could make ``guess_datetime_format`` smarter by using a random sample of elem ### PDEP History - 18 September 2022: Initial draft +- 25 January 2023: Amended to mention ``format='ISO8601'`` and ``format='mixed'`` options