Skip to content

Commit 36e4165

Browse files
Inconsistent date parsing of to_datetime (#42908)
* added warnings when parse inconsistent with dayfirst arg * improved error message * TST: added tests * removed trailing whitespaces * removed pytest.warns * wip * revert * set stacklevel, assert warning messages * okwarning in user guide * 🎨 * catch warnings * fixup * add to to_datetime docstring, add whatsnew note * wip * wip * wip * wip * fixup test * more fixups * fixup * revert to b4bb5b3 * document in timeseries.rst * add tests for read_csv * check expected_inconsistent in tests * fixup docs * remove note about dateutil bug Co-authored-by: arw2019 <[email protected]>
1 parent c17656c commit 36e4165

File tree

7 files changed

+251
-16
lines changed

7 files changed

+251
-16
lines changed

doc/source/user_guide/timeseries.rst

+4-2
Original file line numberDiff line numberDiff line change
@@ -204,16 +204,18 @@ If you use dates which start with the day first (i.e. European style),
204204
you can pass the ``dayfirst`` flag:
205205

206206
.. ipython:: python
207+
:okwarning:
207208
208209
pd.to_datetime(["04-01-2012 10:00"], dayfirst=True)
209210
210211
pd.to_datetime(["14-01-2012", "01-14-2012"], dayfirst=True)
211212
212213
.. warning::
213214

214-
You see in the above example that ``dayfirst`` isn't strict, so if a date
215+
You see in the above example that ``dayfirst`` isn't strict. If a date
215216
can't be parsed with the day being first it will be parsed as if
216-
``dayfirst`` were False.
217+
``dayfirst`` were False. When parsing delimited date strings
218+
(e.g. ``31-12-2012``), a warning will also be raised.
217219

218220
If you pass a single string to ``to_datetime``, it returns a single ``Timestamp``.
219221
``Timestamp`` can also accept string input, but it doesn't accept string parsing

doc/source/whatsnew/v1.4.0.rst

+14-3
Original file line numberDiff line numberDiff line change
@@ -103,10 +103,20 @@ Notable bug fixes
103103

104104
These are bug fixes that might have notable behavior changes.
105105

106-
.. _whatsnew_140.notable_bug_fixes.notable_bug_fix1:
106+
.. _whatsnew_140.notable_bug_fixes.inconsistent_date_string_parsing:
107107

108-
notable_bug_fix1
109-
^^^^^^^^^^^^^^^^
108+
Inconsistent date string parsing
109+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
110+
111+
The ``dayfirst`` option of :func:`to_datetime` isn't strict, and this can lead to surprising behaviour:
112+
113+
.. ipython:: python
114+
:okwarning:
115+
116+
pd.to_datetime(["31-12-2021"], dayfirst=False)
117+
118+
Now, a warning will be raised if a date string cannot be parsed in accordance with the given ``dayfirst`` value when
119+
the value is a delimited date string (e.g. ``31-12-2012``).
110120

111121
.. _whatsnew_140.notable_bug_fixes.notable_bug_fix2:
112122

@@ -253,6 +263,7 @@ Categorical
253263
Datetimelike
254264
^^^^^^^^^^^^
255265
- Bug in :class:`DataFrame` constructor unnecessarily copying non-datetimelike 2D object arrays (:issue:`39272`)
266+
- :func:`to_datetime` would silently swap ``MM/DD/YYYY`` and ``DD/MM/YYYY`` formats if the given ``dayfirst`` option could not be respected - now, a warning is raised in the case of delimited date strings (e.g. ``31-12-2012``) (:issue:`12585`)
256267
-
257268

258269
Timedelta

pandas/_libs/tslibs/parsing.pyx

+24
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ Parsing functions for datetime and datetime-like strings.
33
"""
44
import re
55
import time
6+
import warnings
67

78
from libc.string cimport strchr
89

@@ -81,6 +82,11 @@ class DateParseError(ValueError):
8182
_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0,
8283
second=0, microsecond=0)
8384

85+
PARSING_WARNING_MSG = (
86+
"Parsing '{date_string}' in {format} format. Provide format "
87+
"or specify infer_datetime_format=True for consistent parsing."
88+
)
89+
8490
cdef:
8591
set _not_datelike_strings = {'a', 'A', 'm', 'M', 'p', 'P', 't', 'T'}
8692

@@ -168,10 +174,28 @@ cdef inline object _parse_delimited_date(str date_string, bint dayfirst):
168174
# date_string can't be converted to date, above format
169175
return None, None
170176

177+
swapped_day_and_month = False
171178
if 1 <= month <= MAX_DAYS_IN_MONTH and 1 <= day <= MAX_DAYS_IN_MONTH \
172179
and (month <= MAX_MONTH or day <= MAX_MONTH):
173180
if (month > MAX_MONTH or (day <= MAX_MONTH and dayfirst)) and can_swap:
174181
day, month = month, day
182+
swapped_day_and_month = True
183+
if dayfirst and not swapped_day_and_month:
184+
warnings.warn(
185+
PARSING_WARNING_MSG.format(
186+
date_string=date_string,
187+
format='MM/DD/YYYY'
188+
),
189+
stacklevel=4,
190+
)
191+
elif not dayfirst and swapped_day_and_month:
192+
warnings.warn(
193+
PARSING_WARNING_MSG.format(
194+
date_string=date_string,
195+
format='DD/MM/YYYY'
196+
),
197+
stacklevel=4,
198+
)
175199
if PY_VERSION_HEX >= 0x03060100:
176200
# In Python <= 3.6.0 there is no range checking for invalid dates
177201
# in C api, thus we call faster C version for 3.6.1 or newer

pandas/core/tools/datetimes.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -701,8 +701,14 @@ def to_datetime(
701701
Specify a date parse order if `arg` is str or its list-likes.
702702
If True, parses dates with the day first, eg 10/11/12 is parsed as
703703
2012-11-10.
704-
Warning: dayfirst=True is not strict, but will prefer to parse
705-
with day first (this is a known bug, based on dateutil behavior).
704+
705+
.. warning::
706+
707+
dayfirst=True is not strict, but will prefer to parse
708+
with day first. If a delimited date string cannot be parsed in
709+
accordance with the given `dayfirst` option, e.g.
710+
``to_datetime(['31-12-2021'])``, then a warning will be shown.
711+
706712
yearfirst : bool, default False
707713
Specify a date parse order if `arg` is str or its list-likes.
708714
@@ -711,8 +717,11 @@ def to_datetime(
711717
- If both dayfirst and yearfirst are True, yearfirst takes precedence (same
712718
as dateutil).
713719
714-
Warning: yearfirst=True is not strict, but will prefer to parse
715-
with year first (this is a known bug, based on dateutil behavior).
720+
.. warning::
721+
722+
yearfirst=True is not strict, but will prefer to parse
723+
with year first.
724+
716725
utc : bool, default None
717726
Return UTC DatetimeIndex if True (converting any tz-aware
718727
datetime.datetime objects as well).

pandas/tests/io/parser/test_parse_dates.py

+126-6
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
datetime,
99
)
1010
from io import StringIO
11+
import warnings
1112

1213
from dateutil.parser import parse as du_parse
1314
from hypothesis import (
@@ -39,6 +40,7 @@
3940
from pandas.core.indexes.datetimes import date_range
4041

4142
import pandas.io.date_converters as conv
43+
from pandas.io.parsers import read_csv
4244

4345
# constant
4446
_DEFAULT_DATETIME = datetime(1, 1, 1)
@@ -1556,16 +1558,16 @@ def test_invalid_parse_delimited_date(all_parsers, date_string):
15561558
"date_string,dayfirst,expected",
15571559
[
15581560
# %d/%m/%Y; month > 12 thus replacement
1559-
("13/02/2019", False, datetime(2019, 2, 13)),
15601561
("13/02/2019", True, datetime(2019, 2, 13)),
15611562
# %m/%d/%Y; day > 12 thus there will be no replacement
15621563
("02/13/2019", False, datetime(2019, 2, 13)),
1563-
("02/13/2019", True, datetime(2019, 2, 13)),
15641564
# %d/%m/%Y; dayfirst==True thus replacement
15651565
("04/02/2019", True, datetime(2019, 2, 4)),
15661566
],
15671567
)
1568-
def test_parse_delimited_date_swap(all_parsers, date_string, dayfirst, expected):
1568+
def test_parse_delimited_date_swap_no_warning(
1569+
all_parsers, date_string, dayfirst, expected
1570+
):
15691571
parser = all_parsers
15701572
expected = DataFrame({0: [expected]}, dtype="datetime64[ns]")
15711573
result = parser.read_csv(
@@ -1574,6 +1576,30 @@ def test_parse_delimited_date_swap(all_parsers, date_string, dayfirst, expected)
15741576
tm.assert_frame_equal(result, expected)
15751577

15761578

1579+
@pytest.mark.parametrize(
1580+
"date_string,dayfirst,expected",
1581+
[
1582+
# %d/%m/%Y; month > 12 thus replacement
1583+
("13/02/2019", False, datetime(2019, 2, 13)),
1584+
# %m/%d/%Y; day > 12 thus there will be no replacement
1585+
("02/13/2019", True, datetime(2019, 2, 13)),
1586+
],
1587+
)
1588+
def test_parse_delimited_date_swap_with_warning(
1589+
all_parsers, date_string, dayfirst, expected
1590+
):
1591+
parser = all_parsers
1592+
expected = DataFrame({0: [expected]}, dtype="datetime64[ns]")
1593+
warning_msg = (
1594+
"Provide format or specify infer_datetime_format=True for consistent parsing"
1595+
)
1596+
with tm.assert_produces_warning(UserWarning, match=warning_msg):
1597+
result = parser.read_csv(
1598+
StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0]
1599+
)
1600+
tm.assert_frame_equal(result, expected)
1601+
1602+
15771603
def _helper_hypothesis_delimited_date(call, date_string, **kwargs):
15781604
msg, result = None, None
15791605
try:
@@ -1602,9 +1628,11 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti
16021628
except_in_dateutil, except_out_dateutil = None, None
16031629
date_string = test_datetime.strftime(date_format.replace(" ", delimiter))
16041630

1605-
except_out_dateutil, result = _helper_hypothesis_delimited_date(
1606-
parse_datetime_string, date_string, dayfirst=dayfirst
1607-
)
1631+
with warnings.catch_warnings():
1632+
warnings.filterwarnings("ignore", category=UserWarning)
1633+
except_out_dateutil, result = _helper_hypothesis_delimited_date(
1634+
parse_datetime_string, date_string, dayfirst=dayfirst
1635+
)
16081636
except_in_dateutil, expected = _helper_hypothesis_delimited_date(
16091637
du_parse,
16101638
date_string,
@@ -1674,3 +1702,95 @@ def test_date_parser_usecols_thousands(all_parsers):
16741702
)
16751703
expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2})
16761704
tm.assert_frame_equal(result, expected)
1705+
1706+
1707+
def test_dayfirst_warnings():
1708+
# GH 12585
1709+
warning_msg_day_first = (
1710+
"Parsing '31/12/2014' in DD/MM/YYYY format. Provide "
1711+
"format or specify infer_datetime_format=True for consistent parsing."
1712+
)
1713+
warning_msg_month_first = (
1714+
"Parsing '03/30/2011' in MM/DD/YYYY format. Provide "
1715+
"format or specify infer_datetime_format=True for consistent parsing."
1716+
)
1717+
1718+
# CASE 1: valid input
1719+
input = "date\n31/12/2014\n10/03/2011"
1720+
expected_consistent = DatetimeIndex(
1721+
["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date"
1722+
)
1723+
expected_inconsistent = DatetimeIndex(
1724+
["2014-12-31", "2011-10-03"], dtype="datetime64[ns]", freq=None, name="date"
1725+
)
1726+
1727+
# A. dayfirst arg correct, no warning
1728+
res1 = read_csv(
1729+
StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date"
1730+
).index
1731+
tm.assert_index_equal(expected_consistent, res1)
1732+
1733+
# B. dayfirst arg incorrect, warning + incorrect output
1734+
with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first):
1735+
res2 = read_csv(
1736+
StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date"
1737+
).index
1738+
tm.assert_index_equal(expected_inconsistent, res2)
1739+
1740+
# C. dayfirst default arg, same as B
1741+
with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first):
1742+
res3 = read_csv(
1743+
StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date"
1744+
).index
1745+
tm.assert_index_equal(expected_inconsistent, res3)
1746+
1747+
# D. infer_datetime_format=True overrides dayfirst default
1748+
# no warning + correct result
1749+
res4 = read_csv(
1750+
StringIO(input),
1751+
parse_dates=["date"],
1752+
infer_datetime_format=True,
1753+
index_col="date",
1754+
).index
1755+
tm.assert_index_equal(expected_consistent, res4)
1756+
1757+
# CASE 2: invalid input
1758+
# cannot consistently process with single format
1759+
# warnings *always* raised
1760+
1761+
# first in DD/MM/YYYY, second in MM/DD/YYYY
1762+
input = "date\n31/12/2014\n03/30/2011"
1763+
expected = DatetimeIndex(
1764+
["2014-12-31", "2011-03-30"], dtype="datetime64[ns]", freq=None, name="date"
1765+
)
1766+
1767+
# A. use dayfirst=True
1768+
with tm.assert_produces_warning(UserWarning, match=warning_msg_month_first):
1769+
res5 = read_csv(
1770+
StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date"
1771+
).index
1772+
tm.assert_index_equal(expected, res5)
1773+
1774+
# B. use dayfirst=False
1775+
with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first):
1776+
res6 = read_csv(
1777+
StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date"
1778+
).index
1779+
tm.assert_index_equal(expected, res6)
1780+
1781+
# C. use dayfirst default arg, same as B
1782+
with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first):
1783+
res7 = read_csv(
1784+
StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date"
1785+
).index
1786+
tm.assert_index_equal(expected, res7)
1787+
1788+
# D. use infer_datetime_format=True
1789+
with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first):
1790+
res8 = read_csv(
1791+
StringIO(input),
1792+
parse_dates=["date"],
1793+
infer_datetime_format=True,
1794+
index_col="date",
1795+
).index
1796+
tm.assert_index_equal(expected, res8)

pandas/tests/scalar/period/test_period.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,7 @@ def test_to_timestamp_tz_arg(self, tzstr):
572572

573573
with tm.assert_produces_warning(FutureWarning):
574574
p = Period("1/1/2005", freq="A").to_timestamp(freq="A", tz=tzstr)
575-
exp = Timestamp("31/12/2005", tz="UTC").tz_convert(tzstr)
575+
exp = Timestamp(day=31, month=12, year=2005, tz="UTC").tz_convert(tzstr)
576576
exp_zone = pytz.timezone(tzstr).normalize(p)
577577

578578
assert p == exp

pandas/tests/tools/test_to_datetime.py

+69
Original file line numberDiff line numberDiff line change
@@ -1839,6 +1839,75 @@ def test_dayfirst(self, cache):
18391839
tm.assert_index_equal(expected, idx5)
18401840
tm.assert_index_equal(expected, idx6)
18411841

1842+
def test_dayfirst_warnings(self):
1843+
# GH 12585
1844+
warning_msg_day_first = (
1845+
"Parsing '31/12/2014' in DD/MM/YYYY format. Provide "
1846+
"format or specify infer_datetime_format=True for consistent parsing."
1847+
)
1848+
warning_msg_month_first = (
1849+
"Parsing '03/30/2011' in MM/DD/YYYY format. Provide "
1850+
"format or specify infer_datetime_format=True for consistent parsing."
1851+
)
1852+
1853+
# CASE 1: valid input
1854+
arr = ["31/12/2014", "10/03/2011"]
1855+
expected_consistent = DatetimeIndex(
1856+
["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None
1857+
)
1858+
expected_inconsistent = DatetimeIndex(
1859+
["2014-12-31", "2011-10-03"], dtype="datetime64[ns]", freq=None
1860+
)
1861+
1862+
# A. dayfirst arg correct, no warning
1863+
res1 = to_datetime(arr, dayfirst=True)
1864+
tm.assert_index_equal(expected_consistent, res1)
1865+
1866+
# B. dayfirst arg incorrect, warning + incorrect output
1867+
with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first):
1868+
res2 = to_datetime(arr, dayfirst=False)
1869+
tm.assert_index_equal(expected_inconsistent, res2)
1870+
1871+
# C. dayfirst default arg, same as B
1872+
with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first):
1873+
res3 = to_datetime(arr, dayfirst=False)
1874+
tm.assert_index_equal(expected_inconsistent, res3)
1875+
1876+
# D. infer_datetime_format=True overrides dayfirst default
1877+
# no warning + correct result
1878+
res4 = to_datetime(arr, infer_datetime_format=True)
1879+
tm.assert_index_equal(expected_consistent, res4)
1880+
1881+
# CASE 2: invalid input
1882+
# cannot consistently process with single format
1883+
# warnings *always* raised
1884+
1885+
arr = ["31/12/2014", "03/30/2011"]
1886+
# first in DD/MM/YYYY, second in MM/DD/YYYY
1887+
expected = DatetimeIndex(
1888+
["2014-12-31", "2011-03-30"], dtype="datetime64[ns]", freq=None
1889+
)
1890+
1891+
# A. use dayfirst=True
1892+
with tm.assert_produces_warning(UserWarning, match=warning_msg_month_first):
1893+
res5 = to_datetime(arr, dayfirst=True)
1894+
tm.assert_index_equal(expected, res5)
1895+
1896+
# B. use dayfirst=False
1897+
with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first):
1898+
res6 = to_datetime(arr, dayfirst=False)
1899+
tm.assert_index_equal(expected, res6)
1900+
1901+
# C. use dayfirst default arg, same as B
1902+
with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first):
1903+
res7 = to_datetime(arr, dayfirst=False)
1904+
tm.assert_index_equal(expected, res7)
1905+
1906+
# D. use infer_datetime_format=True
1907+
with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first):
1908+
res8 = to_datetime(arr, infer_datetime_format=True)
1909+
tm.assert_index_equal(expected, res8)
1910+
18421911
@pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray])
18431912
def test_to_datetime_dta_tz(self, klass):
18441913
# GH#27733

0 commit comments

Comments
 (0)