diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 1c7942dfedafa..57516a7d039e6 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -813,6 +813,7 @@ Reshaping
 - Bug in :meth:`DataFrame.stack` not preserving ``CategoricalDtype`` in a ``MultiIndex`` (:issue:`36991`)
 - Bug in :func:`to_datetime` raising error when input sequence contains unhashable items (:issue:`39756`)
 - Bug in :meth:`Series.explode` preserving index when ``ignore_index`` was ``True`` and values were scalars (:issue:`40487`)
+- Bug in :func:`to_datetime` raising ``ValueError`` when :class:`Series` contains ``None`` and ``NaT`` and has more than 50 elements (:issue:`39882`)
 
 Sparse
 ^^^^^^
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 77bbde62d607e..102cdf4334510 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -84,6 +84,7 @@
 Scalar = Union[int, float, str]
 DatetimeScalar = TypeVar("DatetimeScalar", Scalar, datetime)
 DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]
+start_caching_at = 50
 
 
 # ---------------------------------------------------------------------
@@ -130,7 +131,7 @@ def should_cache(
     # default realization
     if check_count is None:
         # in this case, the gain from caching is negligible
-        if len(arg) <= 50:
+        if len(arg) <= start_caching_at:
             return False
 
         if len(arg) <= 5000:
@@ -193,6 +194,9 @@ def _maybe_cache(
         if len(unique_dates) < len(arg):
             cache_dates = convert_listlike(unique_dates, format)
             cache_array = Series(cache_dates, index=unique_dates)
+            if not cache_array.is_unique:
+                # GH#39882 in case of None and NaT we get duplicates
+                cache_array = cache_array.drop_duplicates()
     return cache_array
 
 
diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
index 8283bcd16dbad..81203b944fa92 100644
--- a/pandas/tests/series/methods/test_convert_dtypes.py
+++ b/pandas/tests/series/methods/test_convert_dtypes.py
@@ -13,6 +13,7 @@
 # for most cases), and the specific cases where the result deviates from
 # this default. Those overrides are defined as a dict with (keyword, val) as
 # dictionary key. In case of multiple items, the last override takes precedence.
+
 test_cases = [
     (
         # data
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index cefbea529e366..3e0c12c6a22cc 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -43,6 +43,7 @@
 import pandas._testing as tm
 from pandas.core.arrays import DatetimeArray
 from pandas.core.tools import datetimes as tools
+from pandas.core.tools.datetimes import start_caching_at
 
 
 class TestTimeConversionFormats:
@@ -956,6 +957,19 @@ def test_to_datetime_cache_scalar(self):
         expected = Timestamp("20130101 00:00:00")
         assert result == expected
 
+    def test_convert_object_to_datetime_with_cache(self):
+        # GH#39882
+        ser = Series(
+            [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")],
+            dtype="object",
+        )
+        result = to_datetime(ser, errors="coerce")
+        expected = Series(
+            [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")],
+            dtype="datetime64[ns]",
+        )
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize(
         "date, format",
         [