From 26c55ca1b00afd0814f4ca3fa7b99d79f1eab93b Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 17 Apr 2021 23:32:58 +0200 Subject: [PATCH 1/3] Bug in to_datetime raising ValueError with None and NaT and more than 50 elements --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/tools/datetimes.py | 3 +++ pandas/tests/series/methods/test_convert_dtypes.py | 11 +++++++++++ 3 files changed, 15 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 1c7942dfedafa..57516a7d039e6 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -813,6 +813,7 @@ Reshaping - Bug in :meth:`DataFrame.stack` not preserving ``CategoricalDtype`` in a ``MultiIndex`` (:issue:`36991`) - Bug in :func:`to_datetime` raising error when input sequence contains unhashable items (:issue:`39756`) - Bug in :meth:`Series.explode` preserving index when ``ignore_index`` was ``True`` and values were scalars (:issue:`40487`) +- Bug in :func:`to_datetime` raising ``ValueError`` when :class:`Series` contains ``None`` and ``NaT`` and has more than 50 elements (:issue:`39882`) Sparse ^^^^^^ diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 77bbde62d607e..39efd70aa6717 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -193,6 +193,9 @@ def _maybe_cache( if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates, format) cache_array = Series(cache_dates, index=unique_dates) + if not cache_array.is_unique: + # GH#39882 in case of None and NaT we get duplicates + cache_array = cache_array.drop_duplicates() return cache_array diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 8283bcd16dbad..2043d43216b2c 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -225,3 +225,14 @@ def test_convert_bool_dtype(self): # GH32287 df = pd.DataFrame({"A": pd.array([True])}) tm.assert_frame_equal(df, df.convert_dtypes()) + + def test_convert_object_to_datetime_with_cache(self): + # GH#39882 + ser = pd.Series( + [None] + [pd.NaT] * 50 + [pd.Timestamp("2012-07-26")], dtype="object" + ) + result = pd.to_datetime(ser, errors="coerce") + expected = pd.Series( + [pd.NaT] * 51 + [pd.Timestamp("2012-07-26")], dtype="datetime64[ns]" + ) + tm.assert_series_equal(result, expected) From 0e82619f6cc622d15cf2217bad0b94700cd2ae67 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 18 Apr 2021 23:26:42 +0200 Subject: [PATCH 2/3] Use parameter to keep test valid --- pandas/core/tools/datetimes.py | 3 ++- pandas/tests/series/methods/test_convert_dtypes.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 39efd70aa6717..102cdf4334510 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -84,6 +84,7 @@ Scalar = Union[int, float, str] DatetimeScalar = TypeVar("DatetimeScalar", Scalar, datetime) DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible] +start_caching_at = 50 # --------------------------------------------------------------------- @@ -130,7 +131,7 @@ def should_cache( # default realization if check_count is None: # in this case, the gain from caching is negligible - if len(arg) <= 50: + if len(arg) <= start_caching_at: return False if len(arg) <= 5000: diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 2043d43216b2c..eac1ef896ebfe 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -13,6 +13,8 @@ # for most cases), and the specific cases where the result deviates from # this default. Those overrides are defined as a dict with (keyword, val) as # dictionary key. In case of multiple items, the last override takes precedence. +from pandas.core.tools.datetimes import start_caching_at + test_cases = [ ( # data @@ -229,10 +231,12 @@ def test_convert_bool_dtype(self): def test_convert_object_to_datetime_with_cache(self): # GH#39882 ser = pd.Series( - [None] + [pd.NaT] * 50 + [pd.Timestamp("2012-07-26")], dtype="object" + [None] + [pd.NaT] * start_caching_at + [pd.Timestamp("2012-07-26")], + dtype="object", ) result = pd.to_datetime(ser, errors="coerce") expected = pd.Series( - [pd.NaT] * 51 + [pd.Timestamp("2012-07-26")], dtype="datetime64[ns]" + [pd.NaT] * (start_caching_at + 1) + [pd.Timestamp("2012-07-26")], + dtype="datetime64[ns]", ) tm.assert_series_equal(result, expected) From c4a141e336fc00280d83dd7593add680b2c00f18 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 20 Apr 2021 01:24:24 +0200 Subject: [PATCH 3/3] Move test --- pandas/tests/series/methods/test_convert_dtypes.py | 14 -------------- pandas/tests/tools/test_to_datetime.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index eac1ef896ebfe..81203b944fa92 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -13,7 +13,6 @@ # for most cases), and the specific cases where the result deviates from # this default. Those overrides are defined as a dict with (keyword, val) as # dictionary key. In case of multiple items, the last override takes precedence. -from pandas.core.tools.datetimes import start_caching_at test_cases = [ ( @@ -227,16 +226,3 @@ def test_convert_bool_dtype(self): # GH32287 df = pd.DataFrame({"A": pd.array([True])}) tm.assert_frame_equal(df, df.convert_dtypes()) - - def test_convert_object_to_datetime_with_cache(self): - # GH#39882 - ser = pd.Series( - [None] + [pd.NaT] * start_caching_at + [pd.Timestamp("2012-07-26")], - dtype="object", - ) - result = pd.to_datetime(ser, errors="coerce") - expected = pd.Series( - [pd.NaT] * (start_caching_at + 1) + [pd.Timestamp("2012-07-26")], - dtype="datetime64[ns]", - ) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index cefbea529e366..3e0c12c6a22cc 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -43,6 +43,7 @@ import pandas._testing as tm from pandas.core.arrays import DatetimeArray from pandas.core.tools import datetimes as tools +from pandas.core.tools.datetimes import start_caching_at class TestTimeConversionFormats: @@ -956,6 +957,19 @@ def test_to_datetime_cache_scalar(self): expected = Timestamp("20130101 00:00:00") assert result == expected + def test_convert_object_to_datetime_with_cache(self): + # GH#39882 + ser = Series( + [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")], + dtype="object", + ) + result = to_datetime(ser, errors="coerce") + expected = Series( + [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")], + dtype="datetime64[ns]", + ) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( "date, format", [