From 33c96e264858ffd4f65f6f0931c30dcb78d7ffe2 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jun 2022 13:01:55 +0200 Subject: [PATCH 1/9] try improving --- pandas/_libs/tslibs/parsing.pyx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 8b42ed195957b..fe9693865ca31 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -85,8 +85,9 @@ _DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, second=0, microsecond=0) PARSING_WARNING_MSG = ( - "Parsing '{date_string}' in {format} format. Provide format " - "or specify infer_datetime_format=True for consistent parsing." + "Parsing dates in {format} format when dayfirst={dayfirst} was specified. " + "This may lead to inconsistently-parsed dates! Specify a format " + "for consistent parsing." ) cdef: @@ -186,7 +187,8 @@ cdef inline object _parse_delimited_date(str date_string, bint dayfirst): warnings.warn( PARSING_WARNING_MSG.format( date_string=date_string, - format='MM/DD/YYYY' + format='MM/DD/YYYY', + dayfirst='True', ), stacklevel=4, ) @@ -195,6 +197,7 @@ cdef inline object _parse_delimited_date(str date_string, bint dayfirst): PARSING_WARNING_MSG.format( date_string=date_string, format='DD/MM/YYYY' + dayfirst='False', ), stacklevel=4, ) From 7d06940203b2bfe789329ddc635bd1c48cd0d463 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jun 2022 14:06:10 +0200 Subject: [PATCH 2/9] WARN: clarify warning message from to_datetime when dayfirst can't be respected --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/tslibs/parsing.pyx | 6 +++--- pandas/tests/io/parser/test_parse_dates.py | 4 +--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 56b09348189ee..b1c75acee550f 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -727,6 +727,7 @@ Other Deprecations 
- Deprecated the ``display.column_space`` global configuration option (:issue:`7576`) - Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`) - Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`) +- Clarified warning from :meth:`pandas.to_datetime` when delimited dates can't be parsed in accordance to specified ``dayfirst`` argument (:issue:`46210`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index fe9693865ca31..a506bcdee2530 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -87,7 +87,7 @@ _DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, PARSING_WARNING_MSG = ( "Parsing dates in {format} format when dayfirst={dayfirst} was specified. " "This may lead to inconsistently-parsed dates! Specify a format " - "for consistent parsing." + "to ensure consistent parsing." 
) cdef: @@ -196,8 +196,8 @@ cdef inline object _parse_delimited_date(str date_string, bint dayfirst): warnings.warn( PARSING_WARNING_MSG.format( date_string=date_string, - format='DD/MM/YYYY' - dayfirst='False', + format='DD/MM/YYYY', + dayfirst='False (the default)', ), stacklevel=4, ) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 449d5a954613b..cf24855210301 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1677,9 +1677,7 @@ def test_parse_delimited_date_swap_with_warning( ): parser = all_parsers expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") - warning_msg = ( - "Provide format or specify infer_datetime_format=True for consistent parsing" - ) + warning_msg = "Specify a format to ensure consistent parsing" with tm.assert_produces_warning(UserWarning, match=warning_msg): result = parser.read_csv( StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] From 3d65a644b77097d104e239e7935d737eec575663 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jun 2022 14:33:08 +0200 Subject: [PATCH 3/9] add test to count number of warnings emitted --- pandas/tests/io/parser/test_parse_dates.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index cf24855210301..c454253cce7a4 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1685,6 +1685,14 @@ def test_parse_delimited_date_swap_with_warning( tm.assert_frame_equal(result, expected) +def test_parse_multiple_delimited_dates_with_swap_warnings(): + # GH46210 + warning_msg = "Specify a format to ensure consistent parsing" + with tm.assert_produces_warning(UserWarning, match=warning_msg) as record: + pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) + assert len(record) == 1 + + def 
_helper_hypothesis_delimited_date(call, date_string, **kwargs): msg, result = None, None try: From 025a19026332edaa8e9ae9b8bb09ff75e02a6f91 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jun 2022 14:38:39 +0200 Subject: [PATCH 4/9] fix whatsnew note --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b1c75acee550f..2fed373a5c79b 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -727,7 +727,7 @@ Other Deprecations - Deprecated the ``display.column_space`` global configuration option (:issue:`7576`) - Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`) - Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`) -- Clarified warning from :meth:`pandas.to_datetime` when delimited dates can't be parsed in accordance to specified ``dayfirst`` argument (:issue:`46210`) +- Clarified warning from :func:`to_datetime` when delimited dates can't be parsed in accordance to specified ``dayfirst`` argument (:issue:`46210`) .. 
--------------------------------------------------------------------------- From 4cf7385604999efaa0cecd006dc695b2d133affe Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jun 2022 14:47:01 +0200 Subject: [PATCH 5/9] fixup test to use set --- pandas/tests/io/parser/test_parse_dates.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index c454253cce7a4..2ea6bfa36adfa 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1690,7 +1690,10 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): warning_msg = "Specify a format to ensure consistent parsing" with tm.assert_produces_warning(UserWarning, match=warning_msg) as record: pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) - assert len(record) == 1 + assert len(set(record)) == 1 + # Using set(record) as repetitions of the same warning are suppressed + # https://docs.python.org/3/library/warnings.html + # and here we care to check that the warning is only shows once to users. 
def _helper_hypothesis_delimited_date(call, date_string, **kwargs): From cb152b290e68c2da36eb9d5cec43f90bb62d2be3 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jun 2022 18:38:51 +0200 Subject: [PATCH 6/9] update tests --- pandas/tests/io/parser/test_parse_dates.py | 12 +++++++----- pandas/tests/io/xml/test_xml_dtypes.py | 2 +- pandas/tests/tools/test_to_datetime.py | 5 +++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 2ea6bfa36adfa..5786111614c3f 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1690,7 +1690,7 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): warning_msg = "Specify a format to ensure consistent parsing" with tm.assert_produces_warning(UserWarning, match=warning_msg) as record: pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) - assert len(set(record)) == 1 + assert len({str(warning.message) for warning in record}) == 1 # Using set(record) as repetitions of the same warning are suppressed # https://docs.python.org/3/library/warnings.html # and here we care to check that the warning is only shows once to users. @@ -1857,12 +1857,14 @@ def test_parse_dates_and_keep_orgin_column(all_parsers): def test_dayfirst_warnings(): # GH 12585 warning_msg_day_first = ( - "Parsing '31/12/2014' in DD/MM/YYYY format. Provide " - "format or specify infer_datetime_format=True for consistent parsing." + r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) was " + r"specified. This may lead to inconsistently-parsed dates! Specify a format " + r"to ensure consistent parsing." ) warning_msg_month_first = ( - "Parsing '03/30/2011' in MM/DD/YYYY format. Provide " - "format or specify infer_datetime_format=True for consistent parsing." + "Parsing dates in MM/DD/YYYY format when dayfirst=True was " + "specified. 
This may lead to inconsistently-parsed dates! Specify a format " + "to ensure consistent parsing." ) # CASE 1: valid input diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index 6aa4ddfac7628..7347c69f2017e 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -457,7 +457,7 @@ def test_day_first_parse_dates(parser): ) with tm.assert_produces_warning( - UserWarning, match="Parsing '31/12/2020' in DD/MM/YYYY format" + UserWarning, match="Parsing dates DD/MM/YYYY format" ): df_result = read_xml(xml, parse_dates=["date"], parser=parser) df_iter = read_xml_iterparse( diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 4c34b0c0aec0a..58dea52f377b3 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1964,8 +1964,9 @@ def test_dayfirst(self, cache): def test_dayfirst_warnings_valid_input(self): # GH 12585 warning_msg_day_first = ( - "Parsing '31/12/2014' in DD/MM/YYYY format. Provide " - "format or specify infer_datetime_format=True for consistent parsing." + r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) " + "was specified. This may lead to inconsistently-parsed dates! Specify a " + "format to ensure consistent parsing." 
) # CASE 1: valid input From 4eb61c2cf36e3567bb9ca2ff517685a48e52fc4a Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jun 2022 19:11:51 +0200 Subject: [PATCH 7/9] inconsistently-parsed -> inconsistently parsed --- pandas/_libs/tslibs/parsing.pyx | 2 +- pandas/tests/io/parser/test_parse_dates.py | 4 ++-- pandas/tests/tools/test_to_datetime.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index a506bcdee2530..a9441040e0e4b 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -86,7 +86,7 @@ _DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, PARSING_WARNING_MSG = ( "Parsing dates in {format} format when dayfirst={dayfirst} was specified. " - "This may lead to inconsistently-parsed dates! Specify a format " + "This may lead to inconsistently parsed dates! Specify a format " "to ensure consistent parsing." ) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 5786111614c3f..d05961b702c51 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1858,12 +1858,12 @@ def test_dayfirst_warnings(): # GH 12585 warning_msg_day_first = ( r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) was " - r"specified. This may lead to inconsistently-parsed dates! Specify a format " + r"specified. This may lead to inconsistently parsed dates! Specify a format " r"to ensure consistent parsing." ) warning_msg_month_first = ( "Parsing dates in MM/DD/YYYY format when dayfirst=True was " - "specified. This may lead to inconsistently-parsed dates! Specify a format " + "specified. This may lead to inconsistently parsed dates! Specify a format " "to ensure consistent parsing." 
) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 58dea52f377b3..32ddc063421d8 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1965,7 +1965,7 @@ def test_dayfirst_warnings_valid_input(self): # GH 12585 warning_msg_day_first = ( r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) " - "was specified. This may lead to inconsistently-parsed dates! Specify a " + "was specified. This may lead to inconsistently parsed dates! Specify a " "format to ensure consistent parsing." ) From cb7ef75e848bc525121d65d424e9fa9cb9b8d453 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jun 2022 19:14:20 +0200 Subject: [PATCH 8/9] remove date_string from .format call --- pandas/_libs/tslibs/parsing.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index a9441040e0e4b..5cb11436f6f45 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -186,7 +186,6 @@ cdef inline object _parse_delimited_date(str date_string, bint dayfirst): if dayfirst and not swapped_day_and_month: warnings.warn( PARSING_WARNING_MSG.format( - date_string=date_string, format='MM/DD/YYYY', dayfirst='True', ), @@ -195,7 +194,6 @@ cdef inline object _parse_delimited_date(str date_string, bint dayfirst): elif not dayfirst and swapped_day_and_month: warnings.warn( PARSING_WARNING_MSG.format( - date_string=date_string, format='DD/MM/YYYY', dayfirst='False (the default)', ), From ad7433b4e2f8d559a2b7c3a3f1b1134a6ff0011d Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 28 Jun 2022 21:08:38 +0200 Subject: [PATCH 9/9] fixup other tests --- pandas/tests/io/xml/test_xml_dtypes.py | 2 +- pandas/tests/tools/test_to_datetime.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index 
7347c69f2017e..5629830767c3c 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -457,7 +457,7 @@ def test_day_first_parse_dates(parser): ) with tm.assert_produces_warning( - UserWarning, match="Parsing dates DD/MM/YYYY format" + UserWarning, match="Parsing dates in DD/MM/YYYY format" ): df_result = read_xml(xml, parse_dates=["date"], parser=parser) df_iter = read_xml_iterparse( diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 32ddc063421d8..f712b4a24e5e5 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2002,12 +2002,14 @@ def test_dayfirst_warnings_invalid_input(self): # cannot consistently process with single format # warnings *always* raised warning_msg_day_first = ( - "Parsing '31/12/2014' in DD/MM/YYYY format. Provide " - "format or specify infer_datetime_format=True for consistent parsing." + r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) " + "was specified. This may lead to inconsistently parsed dates! Specify a " + "format to ensure consistent parsing." ) warning_msg_month_first = ( - "Parsing '03/30/2011' in MM/DD/YYYY format. Provide " - "format or specify infer_datetime_format=True for consistent parsing." + r"Parsing dates in MM/DD/YYYY format when dayfirst=True " + "was specified. This may lead to inconsistently parsed dates! Specify a " + "format to ensure consistent parsing." ) arr = ["31/12/2014", "03/30/2011"]