Skip to content

Commit 3da20e6

Browse files
author
MarcoGorelli
committed
pdep4
1 parent 2f7dce4 commit 3da20e6

File tree

19 files changed

+522
-505
lines changed

19 files changed

+522
-505
lines changed

doc/source/user_guide/io.rst

+6-25
Original file line numberDiff line numberDiff line change
@@ -1009,41 +1009,22 @@ To parse the mixed-timezone values as a datetime column, pass a partially-applie
10091009
Inferring datetime format
10101010
+++++++++++++++++++++++++
10111011

1012-
If you have ``parse_dates`` enabled for some or all of your columns, and your
1013-
datetime strings are all formatted the same way, you may get a large speed
1014-
up by setting ``infer_datetime_format=True``. If set, pandas will attempt
1015-
to guess the format of your datetime strings, and then use a faster means
1016-
of parsing the strings. 5-10x parsing speeds have been observed. pandas
1017-
will fallback to the usual parsing if either the format cannot be guessed
1018-
or the format that was guessed cannot properly parse the entire column
1019-
of strings. So in general, ``infer_datetime_format`` should not have any
1020-
negative consequences if enabled.
1021-
1022-
Here are some examples of datetime strings that can be guessed (All
1023-
representing December 30th, 2011 at 00:00:00):
1024-
1025-
* "20111230"
1026-
* "2011/12/30"
1027-
* "20111230 00:00:00"
1028-
* "12/30/2011 00:00:00"
1029-
* "30/Dec/2011 00:00:00"
1030-
* "30/December/2011 00:00:00"
1031-
1032-
Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With
1033-
``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With
1034-
``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th.
1012+
If you try to parse a column of date strings, pandas will attempt to guess the format
1013+
from the first non-NaN element, and will then parse the rest of the column with that
1014+
format.
10351015

10361016
.. ipython:: python
10371017
1038-
# Try to infer the format for the index column
10391018
df = pd.read_csv(
10401019
"foo.csv",
10411020
index_col=0,
10421021
parse_dates=True,
1043-
infer_datetime_format=True,
10441022
)
10451023
df
10461024
1025+
In the case that you have mixed datetime formats within the same column, you'll need to
1026+
first read in the file, and then apply :func:`to_datetime` to each element.
1027+
10471028
.. ipython:: python
10481029
:suppress:
10491030

doc/source/whatsnew/v2.0.0.rst

+33
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,39 @@ Optional libraries below the lowest tested version may still work, but are not c
112112

113113
See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
114114

115+
Datetimes are now parsed with a consistent format
116+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
117+
118+
:func:`to_datetime` now parses dates with a consistent format, which is guessed from the first non-NA value
119+
(unless ``format`` is specified). Previously, it would've guessed the format for each element individually.
120+
121+
*Old behavior*:
122+
123+
.. code-block:: ipython
124+
125+
In [1]: ser = pd.Series(['13-01-2000', '12-01-2000'])
126+
In [2]: pd.to_datetime(ser)
127+
Out[2]:
128+
0 2000-01-13
129+
1 2000-12-01
130+
dtype: datetime64[ns]
131+
132+
*New behavior*:
133+
134+
.. ipython:: python
135+
:okwarning:
136+
137+
ser = pd.Series(['13-01-2000', '12-01-2000'])
138+
pd.to_datetime(ser)
139+
140+
Note that this affects :func:`read_csv` as well.
141+
142+
If you still need to parse dates with inconsistent formats, you'll need to apply :func:`to_datetime`
143+
to each element individually, e.g. ::
144+
145+
ser = pd.Series(['13-01-2000', '12 January 2000'])
146+
ser.apply(pd.to_datetime)
147+
115148
.. _whatsnew_200.api_breaking.other:
116149

117150
Other API changes

pandas/_libs/tslibs/parsing.pyx

+20
Original file line numberDiff line numberDiff line change
@@ -1088,6 +1088,7 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
10881088
# rebuild string, capturing any inferred padding
10891089
dt_str = ''.join(tokens)
10901090
if parsed_datetime.strftime(guessed_format) == dt_str:
1091+
_maybe_warn_about_dayfirst(guessed_format, dayfirst)
10911092
return guessed_format
10921093
else:
10931094
return None
@@ -1106,6 +1107,25 @@ cdef str _fill_token(token: str, padding: int):
11061107
token_filled = f'{seconds}.{nanoseconds}'
11071108
return token_filled
11081109

1110+
cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst):
1111+
cdef:
1112+
int day_index = format.find('%d')
1113+
int month_index = format.find('%m')
1114+
1115+
if (day_index != -1) and (month_index != -1):
1116+
if (day_index > month_index) and dayfirst:
1117+
warnings.warn(
1118+
f"Parsing dates in {format} format when dayfirst=True was specified. "
1119+
f"Pass `dayfirst=False` or specify a format to silence this warning.",
1120+
stacklevel=find_stack_level(),
1121+
)
1122+
if (day_index < month_index) and not dayfirst:
1123+
warnings.warn(
1124+
f"Parsing dates in {format} format when dayfirst=False was specified. "
1125+
f"Pass `dayfirst=True` or specify a format to silence this warning.",
1126+
stacklevel=find_stack_level(),
1127+
)
1128+
11091129
@cython.wraparound(False)
11101130
@cython.boundscheck(False)
11111131
cdef inline object convert_to_unicode(object item, bint keep_trivial_numbers):

pandas/core/tools/datetimes.py

+44-60
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919

2020
import numpy as np
2121

22-
from pandas._libs import tslib
22+
from pandas._libs import (
23+
lib,
24+
tslib,
25+
)
2326
from pandas._libs.tslibs import (
2427
OutOfBoundsDatetime,
2528
Timedelta,
@@ -129,7 +132,16 @@ def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str
129132
if (first_non_null := tslib.first_non_null(arr)) != -1:
130133
if type(first_non_nan_element := arr[first_non_null]) is str:
131134
# GH#32264 np.str_ object
132-
return guess_datetime_format(first_non_nan_element, dayfirst=dayfirst)
135+
guessed_format = guess_datetime_format(
136+
first_non_nan_element, dayfirst=dayfirst
137+
)
138+
if guessed_format is not None:
139+
return guessed_format
140+
warnings.warn(
141+
"Could not infer format - "
142+
"to ensure consistent parsing, specify a format.",
143+
stacklevel=find_stack_level(),
144+
)
133145
return None
134146

135147

@@ -331,7 +343,6 @@ def _convert_listlike_datetimes(
331343
tz: Timezone | None = None,
332344
unit: str | None = None,
333345
errors: DateTimeErrorChoices = "raise",
334-
infer_datetime_format: bool = False,
335346
dayfirst: bool | None = None,
336347
yearfirst: bool | None = None,
337348
exact: bool = True,
@@ -415,27 +426,19 @@ def _convert_listlike_datetimes(
415426
arg = ensure_object(arg)
416427
require_iso8601 = False
417428

418-
if infer_datetime_format and format is None:
429+
if format is None:
419430
format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
420431

432+
# There is a special fast-path for iso8601 formatted
433+
# datetime strings, so in those cases don't use the inferred
434+
format because this path makes processing slower in this
435+
# special case
436+
if format is not None and format_is_iso(format):
437+
require_iso8601 = True
438+
format = None
421439
if format is not None:
422-
# There is a special fast-path for iso8601 formatted
423-
# datetime strings, so in those cases don't use the inferred
424-
# format because this path makes process slower in this
425-
# special case
426-
format_is_iso8601 = format_is_iso(format)
427-
if format_is_iso8601:
428-
require_iso8601 = not infer_datetime_format
429-
format = None
430-
431-
if format is not None:
432-
res = _to_datetime_with_format(
433-
arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format
434-
)
435-
if res is not None:
436-
return res
440+
return _to_datetime_with_format(arg, orig_arg, name, tz, format, exact, errors)
437441

438-
assert format is None or infer_datetime_format
439442
utc = tz == "utc"
440443
result, tz_parsed = objects_to_datetime64ns(
441444
arg,
@@ -464,8 +467,7 @@ def _array_strptime_with_fallback(
464467
fmt: str,
465468
exact: bool,
466469
errors: str,
467-
infer_datetime_format: bool,
468-
) -> Index | None:
470+
) -> Index:
469471
"""
470472
Call array_strptime, with fallback behavior depending on 'errors'.
471473
"""
@@ -486,18 +488,14 @@ def _array_strptime_with_fallback(
486488
# no fallback to array_to_datetime: whether fmt was
487489
# inferred or specified, a parsing failure is handled
488490
# here according to 'errors'
489-
if not infer_datetime_format:
490-
if errors == "raise":
491-
raise
492-
elif errors == "coerce":
493-
result = np.empty(arg.shape, dtype="M8[ns]")
494-
iresult = result.view("i8")
495-
iresult.fill(iNaT)
496-
else:
497-
result = arg
491+
if errors == "raise":
492+
raise
493+
elif errors == "coerce":
494+
result = np.empty(arg.shape, dtype="M8[ns]")
495+
iresult = result.view("i8")
496+
iresult.fill(iNaT)
498497
else:
499-
# Indicates to the caller to fallback to objects_to_datetime64ns
500-
return None
498+
result = arg
501499
else:
502500
if "%Z" in fmt or "%z" in fmt:
503501
return _return_parsed_timezone_results(result, timezones, tz, name)
@@ -513,10 +511,9 @@ def _to_datetime_with_format(
513511
fmt: str,
514512
exact: bool,
515513
errors: str,
516-
infer_datetime_format: bool,
517-
) -> Index | None:
514+
) -> Index:
518515
"""
519-
Try parsing with the given format, returning None on failure.
516+
Try parsing with the given format.
520517
"""
521518
result = None
522519

@@ -537,9 +534,7 @@ def _to_datetime_with_format(
537534
return _box_as_indexlike(result, utc=utc, name=name)
538535

539536
# fallback
540-
res = _array_strptime_with_fallback(
541-
arg, name, tz, fmt, exact, errors, infer_datetime_format
542-
)
537+
res = _array_strptime_with_fallback(arg, name, tz, fmt, exact, errors)
543538
return res
544539

545540

@@ -713,7 +708,7 @@ def to_datetime(
713708
format: str | None = None,
714709
exact: bool = True,
715710
unit: str | None = None,
716-
infer_datetime_format: bool = False,
711+
infer_datetime_format: lib.NoDefault | bool = lib.no_default,
717712
origin: str = "unix",
718713
cache: bool = True,
719714
) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None:
@@ -926,24 +921,6 @@ def to_datetime(
926921
1 2016-03-05
927922
dtype: datetime64[ns]
928923
929-
Passing ``infer_datetime_format=True`` can often-times speedup a parsing
930-
if its not an ISO8601 format exactly, but in a regular format.
931-
932-
>>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000)
933-
>>> s.head()
934-
0 3/11/2000
935-
1 3/12/2000
936-
2 3/13/2000
937-
3 3/11/2000
938-
4 3/12/2000
939-
dtype: object
940-
941-
>>> %timeit pd.to_datetime(s, infer_datetime_format=True) # doctest: +SKIP
942-
100 loops, best of 3: 10.4 ms per loop
943-
944-
>>> %timeit pd.to_datetime(s, infer_datetime_format=False) # doctest: +SKIP
945-
1 loop, best of 3: 471 ms per loop
946-
947924
Using a unix epoch time
948925
949926
>>> pd.to_datetime(1490195805, unit='s')
@@ -1025,7 +1002,7 @@ def to_datetime(
10251002
are constant:
10261003
10271004
>>> from datetime import datetime
1028-
>>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)])
1005+
>>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)])
10291006
DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'],
10301007
dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None)
10311008
@@ -1060,6 +1037,14 @@ def to_datetime(
10601037
'2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'],
10611038
dtype='datetime64[ns, UTC]', freq=None)
10621039
"""
1040+
if infer_datetime_format is not lib.no_default:
1041+
# Kept for compatibility with old code - TODO remove
1042+
warnings.warn(
1043+
"The argument 'infer_datetime_format' has been removed - a strict version "
1044+
"of it is now the default, see "
1045+
"https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html",
1046+
stacklevel=find_stack_level(),
1047+
)
10631048
if arg is None:
10641049
return None
10651050

@@ -1075,7 +1060,6 @@ def to_datetime(
10751060
yearfirst=yearfirst,
10761061
errors=errors,
10771062
exact=exact,
1078-
infer_datetime_format=infer_datetime_format,
10791063
)
10801064

10811065
result: Timestamp | NaTType | Series | Index

pandas/io/parsers/base_parser.py

-5
Original file line numberDiff line numberDiff line change
@@ -122,13 +122,11 @@ def __init__(self, kwds) -> None:
122122
self.true_values = kwds.get("true_values")
123123
self.false_values = kwds.get("false_values")
124124
self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
125-
self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
126125
self.cache_dates = kwds.pop("cache_dates", True)
127126

128127
self._date_conv = _make_date_converter(
129128
date_parser=self.date_parser,
130129
dayfirst=self.dayfirst,
131-
infer_datetime_format=self.infer_datetime_format,
132130
cache_dates=self.cache_dates,
133131
)
134132

@@ -1105,7 +1103,6 @@ def _get_empty_meta(
11051103
def _make_date_converter(
11061104
date_parser=None,
11071105
dayfirst: bool = False,
1108-
infer_datetime_format: bool = False,
11091106
cache_dates: bool = True,
11101107
):
11111108
def converter(*date_cols):
@@ -1118,7 +1115,6 @@ def converter(*date_cols):
11181115
utc=None,
11191116
dayfirst=dayfirst,
11201117
errors="ignore",
1121-
infer_datetime_format=infer_datetime_format,
11221118
cache=cache_dates,
11231119
).to_numpy()
11241120

@@ -1188,7 +1184,6 @@ def converter(*date_cols):
11881184
"squeeze": None,
11891185
"compression": None,
11901186
"mangle_dupe_cols": True,
1191-
"infer_datetime_format": False,
11921187
"skip_blank_lines": True,
11931188
"encoding_errors": "strict",
11941189
"on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,

0 commit comments

Comments
 (0)