From 76e9d3dd509bc720f4ce4a0b3182bfa1202f5351 Mon Sep 17 00:00:00 2001 From: Zheyuan Chen Date: Sat, 26 Jun 2021 23:35:26 +0800 Subject: [PATCH 1/8] fix --- pandas/core/tools/datetimes.py | 4 ++-- pandas/tests/tools/test_to_datetime.py | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 014a702618bda..6b431fce6f290 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -194,9 +194,9 @@ def _maybe_cache( if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates, format) cache_array = Series(cache_dates, index=unique_dates) - if not cache_array.is_unique: + if not cache_array.index.is_unique: # GH#39882 in case of None and NaT we get duplicates - cache_array = cache_array.drop_duplicates() + cache_array = cache_array[~cache_array.index.unique()] return cache_array diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 121ca99785831..6c214f1ef50d1 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -957,15 +957,26 @@ def test_to_datetime_cache_scalar(self): expected = Timestamp("20130101 00:00:00") assert result == expected - def test_convert_object_to_datetime_with_cache(self): + @pytest.mark.parametrize("datetime_likes, expected", [ + ( + [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")], + [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")], + ), + ( + [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26"), "2012 July 26", "2012-07-26"], + [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26"), Timestamp("2012-07-26"), Timestamp("2012-07-26")], + ) + ] + ) + def test_convert_object_to_datetime_with_cache(self, datetime_likes, expected): # GH#39882 ser = Series( - [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")], + datetime_likes, dtype="object", ) result = to_datetime(ser, 
errors="coerce") expected = Series( - [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")], + dtype="datetime64[ns]", ) tm.assert_series_equal(result, expected) From 458a726720b338d78676c5a2c7981b4119f3993c Mon Sep 17 00:00:00 2001 From: Zheyuan Chen Date: Sun, 27 Jun 2021 00:37:27 +0800 Subject: [PATCH 2/8] fix 2 --- pandas/core/tools/datetimes.py | 5 +++-- pandas/tests/tools/test_to_datetime.py | 31 +++++++++++++++++++------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 6b431fce6f290..f78299ef5181a 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -195,8 +195,9 @@ def _maybe_cache( cache_dates = convert_listlike(unique_dates, format) cache_array = Series(cache_dates, index=unique_dates) if not cache_array.index.is_unique: - # GH#39882 in case of None and NaT we get duplicates - cache_array = cache_array[~cache_array.index.unique()] + cache_array = cache_array[ + ~cache_array.index.duplicated() + ] return cache_array diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 6c214f1ef50d1..43791b479c36a 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -957,26 +957,41 @@ def test_to_datetime_cache_scalar(self): expected = Timestamp("20130101 00:00:00") assert result == expected - @pytest.mark.parametrize("datetime_likes, expected", [ + @pytest.mark.parametrize( + "datetimelikes, expected", + [ + ( + [None] + [NaT] * start_caching_at + [np.nan], + [NaT] * (start_caching_at + 2), + ), ( [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")], [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")], ), ( - [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26"), "2012 July 26", "2012-07-26"], - [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26"), Timestamp("2012-07-26"), Timestamp("2012-07-26")], - ) - ] + [None] + + [NaT] * 
start_caching_at + + [ + "2012 July 26", + Timestamp("2012-07-26"), + ], + [NaT] * (start_caching_at + 1) + + [ + Timestamp("2012-07-26"), + Timestamp("2012-07-26"), + ], + ), + ], ) - def test_convert_object_to_datetime_with_cache(self, datetime_likes, expected): + def test_convert_object_to_datetime_with_cache(self, datetimelikes, expected): # GH#39882 ser = Series( - datetime_likes, + datetimelikes, dtype="object", ) result = to_datetime(ser, errors="coerce") expected = Series( - + expected, dtype="datetime64[ns]", ) tm.assert_series_equal(result, expected) From 87505f485e6d0e2d815a21c34f90902f63f923f7 Mon Sep 17 00:00:00 2001 From: Zheyuan Chen Date: Sun, 27 Jun 2021 02:05:59 +0800 Subject: [PATCH 3/8] mypy fix --- pandas/core/tools/datetimes.py | 5 ++-- pandas/tests/tools/test_to_datetime.py | 36 +++++++++++--------------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index f78299ef5181a..74b5de3a70a80 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -194,10 +194,9 @@ def _maybe_cache( if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates, format) cache_array = Series(cache_dates, index=unique_dates) + # GH#39882 and GH#35888 if not cache_array.index.is_unique: - cache_array = cache_array[ - ~cache_array.index.duplicated() - ] + cache_array = cache_array[~cache_array.index.duplicated()] return cache_array diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 43791b479c36a..f7f9e94b7bb9e 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -8,6 +8,7 @@ ) from decimal import Decimal import locale +from typing import Any, List, Union from dateutil.parser import parse from dateutil.tz.tz import tzoffset @@ -958,30 +959,23 @@ def test_to_datetime_cache_scalar(self): assert result == expected @pytest.mark.parametrize( - "datetimelikes, 
expected", - [ + "datetimelikes,expected", + ( ( - [None] + [NaT] * start_caching_at + [np.nan], - [NaT] * (start_caching_at + 2), + (None, np.nan) + (NaT,) * start_caching_at, + (NaT,) * (start_caching_at + 2), ), ( - [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")], - [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")], + (None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, + (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, ), ( - [None] - + [NaT] * start_caching_at - + [ - "2012 July 26", - Timestamp("2012-07-26"), - ], - [NaT] * (start_caching_at + 1) - + [ - Timestamp("2012-07-26"), - Timestamp("2012-07-26"), - ], + (None, "2012 July 26", Timestamp("2012-07-26")) + + (NaT,) * start_caching_at, + (NaT, Timestamp("2012-07-26"), Timestamp("2012-07-26")) + + (NaT,) * start_caching_at, ), - ], + ), ) def test_convert_object_to_datetime_with_cache(self, datetimelikes, expected): # GH#39882 @@ -989,12 +983,12 @@ def test_convert_object_to_datetime_with_cache(self, datetimelikes, expected): datetimelikes, dtype="object", ) - result = to_datetime(ser, errors="coerce") - expected = Series( + result_series = to_datetime(ser, errors="coerce") + expected_series = Series( expected, dtype="datetime64[ns]", ) - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result_series, expected_series) @pytest.mark.parametrize( "date, format", From 8d1f006762cdda59607d834a12e8591b494d1c53 Mon Sep 17 00:00:00 2001 From: Zheyuan Chen Date: Sun, 27 Jun 2021 02:08:27 +0800 Subject: [PATCH 4/8] fix variable name --- pandas/tests/tools/test_to_datetime.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f7f9e94b7bb9e..09060ed759b6f 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -959,7 +959,7 @@ def test_to_datetime_cache_scalar(self): assert result == expected 
@pytest.mark.parametrize( - "datetimelikes,expected", + "datetimelikes,expected_values", ( ( (None, np.nan) + (NaT,) * start_caching_at, @@ -977,7 +977,7 @@ def test_to_datetime_cache_scalar(self): ), ), ) - def test_convert_object_to_datetime_with_cache(self, datetimelikes, expected): + def test_convert_object_to_datetime_with_cache(self, datetimelikes, expected_values): # GH#39882 ser = Series( datetimelikes, @@ -985,7 +985,7 @@ def test_convert_object_to_datetime_with_cache(self, datetimelikes, expected): ) result_series = to_datetime(ser, errors="coerce") expected_series = Series( - expected, + expected_values, dtype="datetime64[ns]", ) tm.assert_series_equal(result_series, expected_series) From 06dfcb19e7ccee421e982016ff8e9b8027d5347b Mon Sep 17 00:00:00 2001 From: Zheyuan Chen Date: Sun, 27 Jun 2021 02:15:32 +0800 Subject: [PATCH 5/8] format --- pandas/tests/tools/test_to_datetime.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 09060ed759b6f..978363bffe7e2 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -8,7 +8,6 @@ ) from decimal import Decimal import locale -from typing import Any, List, Union from dateutil.parser import parse from dateutil.tz.tz import tzoffset @@ -977,7 +976,9 @@ def test_to_datetime_cache_scalar(self): ), ), ) - def test_convert_object_to_datetime_with_cache(self, datetimelikes, expected_values): + def test_convert_object_to_datetime_with_cache( + self, datetimelikes, expected_values + ): # GH#39882 ser = Series( datetimelikes, From 6d3a30513419c3cb3f13cbf7edf18d8bb673abbe Mon Sep 17 00:00:00 2001 From: Zheyuan Chen Date: Sun, 27 Jun 2021 02:23:36 +0800 Subject: [PATCH 6/8] comment --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 
74b5de3a70a80..7414fdf190936 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -194,7 +194,7 @@ def _maybe_cache( if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates, format) cache_array = Series(cache_dates, index=unique_dates) - # GH#39882 and GH#35888 + # GH#39882 and GH#35888 in case of None and NaT we get duplicates if not cache_array.index.is_unique: cache_array = cache_array[~cache_array.index.duplicated()] return cache_array From 51382c9c08b9e895725a1c6e6ded9a8932d45f23 Mon Sep 17 00:00:00 2001 From: Zheyuan Chen Date: Sun, 27 Jun 2021 02:48:44 +0800 Subject: [PATCH 7/8] fix counter-example --- pandas/tests/tools/test_to_datetime.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 978363bffe7e2..9da7951c199ca 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -969,10 +969,11 @@ def test_to_datetime_cache_scalar(self): (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, ), ( - (None, "2012 July 26", Timestamp("2012-07-26")) - + (NaT,) * start_caching_at, - (NaT, Timestamp("2012-07-26"), Timestamp("2012-07-26")) - + (NaT,) * start_caching_at, + (None,) + + (NaT,) * start_caching_at + + ("2012 July 26", Timestamp("2012-07-26")), + (NaT,) * (start_caching_at + 1) + + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), ), ), ) From 7130d93cd95859d44d5b1aeb6bee979744f4774f Mon Sep 17 00:00:00 2001 From: Zheyuan Chen Date: Sat, 3 Jul 2021 01:03:44 +0800 Subject: [PATCH 8/8] added whatsnew Datetimelike --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 81545ada63ce5..86ae571ff0b77 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -122,7 +122,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ -- +- 
Bug in :func:`to_datetime` returning ``pd.NaT`` for inputs that produce duplicated values when ``cache=True`` (:issue:`42259`) - Timedelta