Skip to content

Commit e01b6ee

Browse files
author
MarcoGorelli
committed
allow format="mixed"
1 parent 262be89 commit e01b6ee

File tree

8 files changed

+97
-99
lines changed

8 files changed

+97
-99
lines changed

doc/source/user_guide/io.rst

+4-4
Original file line numberDiff line numberDiff line change
@@ -1001,17 +1001,17 @@ way to parse dates is to explicitly set ``format=``.
10011001
)
10021002
df
10031003
1004-
In the case that you have mixed datetime formats within the same column, you'll need to
1005-
first read it in as an object dtype and then apply :func:`to_datetime` to each element.
1004+
In the case that you have mixed datetime formats within the same column, you can
1005+
pass ``format='mixed'``
10061006

10071007
.. ipython:: python
10081008
10091009
data = io.StringIO("date\n12 Jan 2000\n2000-01-13\n")
10101010
df = pd.read_csv(data)
1011-
df['date'] = df['date'].apply(pd.to_datetime)
1011+
df['date'] = pd.to_datetime(df['date'], format='mixed')
10121012
df
10131013
1014-
or, if your datetime formats are all ISO8601 (but possibly not identically-formatted):
1014+
or, if your datetime formats are all ISO8601 (possibly not identically-formatted):
10151015

10161016
.. ipython:: python
10171017

doc/source/whatsnew/v2.0.0.rst

+4-3
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ Other enhancements
188188
- Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`)
189189
- Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`)
190190
- :func:`to_datetime` now accepts ``"ISO8601"`` as an argument to ``format``, which will match any ISO8601 strings, even if they are not all identically formatted (:issue:`50411`)
191+
- :func:`to_datetime` now accepts ``"mixed"`` as an argument to ``format``, which will infer the format for each element individually (:issue:`50972`)
191192
-
192193

193194
.. ---------------------------------------------------------------------------
@@ -573,11 +574,11 @@ In the past, :func:`to_datetime` guessed the format for each element independent
573574
574575
Note that this affects :func:`read_csv` as well.
575576

576-
If you still need to parse dates with inconsistent formats, you'll need to apply :func:`to_datetime`
577-
to each element individually, e.g. ::
577+
If you still need to parse dates with inconsistent formats, you can use
578+
``format='mixed'`` (preferably alongside ``dayfirst``) ::
578579

579580
ser = pd.Series(['13-01-2000', '12 January 2000'])
580-
ser.apply(pd.to_datetime)
581+
pd.to_datetime(ser, format='mixed', dayfirst=True)
581582

582583
or, if your formats are all ISO8601 (though possibly not identically formatted) ::
583584

pandas/_libs/tslibs/strptime.pyi

-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ from pandas._typing import npt
55
def array_strptime(
66
values: npt.NDArray[np.object_],
77
fmt: str | None,
8-
fmt_inferred: bool = ...,
98
exact: bool = ...,
109
errors: str = ...,
1110
utc: bool = ...,

pandas/_libs/tslibs/strptime.pyx

+18-35
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,6 @@ cdef dict _parse_code_table = {"y": 0,
152152
def array_strptime(
153153
ndarray[object] values,
154154
str fmt,
155-
bint fmt_inferred=False,
156155
bint exact=True,
157156
errors="raise",
158157
bint utc=False,
@@ -349,40 +348,22 @@ def array_strptime(
349348
if exact:
350349
found = format_regex.match(val)
351350
if not found:
352-
if fmt_inferred:
353-
raise ValueError(
354-
f"time data \"{val}\" doesn't "
355-
f"match (inferred) format \"{fmt}\""
356-
)
357-
else:
358-
raise ValueError(
359-
f"time data \"{val}\" doesn't match format \"{fmt}\""
360-
)
351+
raise ValueError(
352+
f"time data \"{val}\" doesn't match format \"{fmt}\""
353+
)
361354
if len(val) != found.end():
362-
if fmt_inferred:
363-
raise ValueError(
364-
"unconverted data remains when parsing with "
365-
f"(inferred) format \"{fmt}\": \"{val[found.end():]}\""
366-
)
367-
else:
368-
raise ValueError(
369-
"unconverted data remains when parsing with "
370-
f"format \"{fmt}\": \"{val[found.end():]}\""
371-
)
355+
raise ValueError(
356+
"unconverted data remains when parsing with "
357+
f"format \"{fmt}\": \"{val[found.end():]}\""
358+
)
372359

373360
# search
374361
else:
375362
found = format_regex.search(val)
376363
if not found:
377-
if fmt_inferred:
378-
raise ValueError(
379-
f"time data \"{val}\" doesn't match "
380-
f"(inferred) format \"{fmt}\""
381-
)
382-
else:
383-
raise ValueError(
384-
f"time data \"{val}\" doesn't match format \"{fmt}\""
385-
)
364+
raise ValueError(
365+
f"time data \"{val}\" doesn't match format \"{fmt}\""
366+
)
386367

387368
iso_year = -1
388369
year = 1900
@@ -533,12 +514,14 @@ def array_strptime(
533514
result_timezone[i] = tz
534515

535516
except (ValueError, OutOfBoundsDatetime) as ex:
536-
if iso_format:
537-
ex.args = (f"{str(ex)}, at position {i}. If your time strings "
538-
"are all (not-necessarily-identically-formatted) ISO8601, "
539-
"you could try passing 'format=\"ISO8601\"'",)
540-
else:
541-
ex.args = (f"{str(ex)}, at position {i}",)
517+
ex.args = (
518+
f"{str(ex)}, at position {i}. You might want to try:\n"
519+
" - passing ``format='ISO8601'`` if your strings are "
520+
"all ISO8601 but not necessarily in exactly the same format;\n"
521+
" - passing ``format='mixed'``, and the format will be "
522+
"inferred for each element individually. "
523+
"You might want to use ``dayfirst`` alongside this.",
524+
)
542525
if is_coerce:
543526
iresult[i] = NPY_NAT
544527
continue

pandas/core/tools/datetimes.py

+15-14
Original file line numberDiff line numberDiff line change
@@ -442,15 +442,12 @@ def _convert_listlike_datetimes(
442442

443443
arg = ensure_object(arg)
444444

445-
format_inferred = False
446-
if format is None:
445+
if format is None and format != "mixed":
447446
format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
448-
format_inferred = True
449447

450-
if format is not None:
451-
return _array_strptime_with_fallback(
452-
arg, name, utc, format, format_inferred, exact, errors
453-
)
448+
# `format` could not be inferred, or user asked for mixed-format parsing.
449+
if format is not None and format != "mixed":
450+
return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
454451

455452
result, tz_parsed = objects_to_datetime64ns(
456453
arg,
@@ -475,16 +472,13 @@ def _array_strptime_with_fallback(
475472
name,
476473
utc: bool,
477474
fmt: str,
478-
fmt_inferred: bool,
479475
exact: bool,
480476
errors: str,
481477
) -> Index:
482478
"""
483479
Call array_strptime, with fallback behavior depending on 'errors'.
484480
"""
485-
result, timezones = array_strptime(
486-
arg, fmt, fmt_inferred=fmt_inferred, exact=exact, errors=errors, utc=utc
487-
)
481+
result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc)
488482
if any(tz is not None for tz in timezones):
489483
return _return_parsed_timezone_results(result, timezones, utc, name)
490484

@@ -694,7 +688,7 @@ def to_datetime(
694688
yearfirst: bool = False,
695689
utc: bool = False,
696690
format: str | None = None,
697-
exact: bool = True,
691+
exact: bool | lib.NoDefault = lib.no_default,
698692
unit: str | None = None,
699693
infer_datetime_format: lib.NoDefault | bool = lib.no_default,
700694
origin: str = "unix",
@@ -766,15 +760,20 @@ def to_datetime(
766760
<https://docs.python.org/3/library/datetime.html
767761
#strftime-and-strptime-behavior>`_ for more information on choices, though
768762
note that :const:`"%f"` will parse all the way up to nanoseconds.
769-
You can also pass "ISO8601" to parse any ISO8601 time string.
763+
You can also pass:
764+
765+
- "ISO8601", to parse any ISO8601 time string (not necessarily in exactly the
766+
same format);
767+
- "mixed", to infer the format for each element individually. This is risky,
768+
and you should probably use it along with `dayfirst`.
770769
exact : bool, default True
771770
Control how `format` is used:
772771
773772
- If :const:`True`, require an exact `format` match.
774773
- If :const:`False`, allow the `format` to match anywhere in the target
775774
string.
776775
777-
Note that if ``format='ISO8601'`` then `exact` has no effect.
776+
Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``.
778777
unit : str, default 'ns'
779778
The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
780779
integer or float number. This will be based off the origin.
@@ -1006,6 +1005,8 @@ def to_datetime(
10061005
DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'],
10071006
dtype='datetime64[ns, UTC]', freq=None)
10081007
"""
1008+
if exact is not lib.no_default and format in {"mixed", "ISO8601"}:
1009+
raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'")
10091010
if infer_datetime_format is not lib.no_default:
10101011
warnings.warn(
10111012
"The argument 'infer_datetime_format' is deprecated and will "

pandas/tests/io/parser/test_parse_dates.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1719,8 +1719,8 @@ def test_parse_multiple_delimited_dates_with_swap_warnings():
17191719
with pytest.raises(
17201720
ValueError,
17211721
match=(
1722-
r'^time data "31/05/2000" doesn\'t match \(inferred\) format "%m/%d/%Y", '
1723-
r"at position 1$"
1722+
r'^time data "31/05/2000" doesn\'t match format "%m/%d/%Y", '
1723+
r"at position 1. You might want to try:\n - passing ``format='ISO8601'``"
17241724
),
17251725
):
17261726
pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"])

0 commit comments

Comments
 (0)