Bug in to_datetime raising ValueError with None and NaT and more than 50 elements (pandas-dev#41006)

phofl · yeshsurya · commit c94d8a3b6165 · 2021-04-21T11:41:41.000+05:30
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -858,6 +858,7 @@ Reshaping
 - Bug in :meth:`DataFrame.stack` not preserving ``CategoricalDtype`` in a ``MultiIndex`` (:issue:`36991`)
 - Bug in :func:`to_datetime` raising error when input sequence contains unhashable items (:issue:`39756`)
 - Bug in :meth:`Series.explode` preserving index when ``ignore_index`` was ``True`` and values were scalars (:issue:`40487`)
+- Bug in :func:`to_datetime` raising ``ValueError`` when the input :class:`Series` contains both ``None`` and ``NaT`` and has more than 50 elements (:issue:`39882`)
 
 Sparse
 ^^^^^^
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -84,6 +84,7 @@
 Scalar = Union[int, float, str]
 DatetimeScalar = TypeVar("DatetimeScalar", Scalar, datetime)
 DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]
+start_caching_at = 50
 
 
 # ---------------------------------------------------------------------
@@ -130,7 +131,7 @@ def should_cache(
     # default realization
     if check_count is None:
         # in this case, the gain from caching is negligible
-        if len(arg) <= 50:
+        if len(arg) <= start_caching_at:
             return False
 
         if len(arg) <= 5000:
@@ -193,6 +194,9 @@ def _maybe_cache(
         if len(unique_dates) < len(arg):
             cache_dates = convert_listlike(unique_dates, format)
             cache_array = Series(cache_dates, index=unique_dates)
+            if not cache_array.is_unique:
+                # GH#39882 None and NaT both convert to NaT, giving duplicates
+                cache_array = cache_array.drop_duplicates()
     return cache_array
 
 
diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
@@ -13,6 +13,7 @@
 # for most cases), and the specific cases where the result deviates from
 # this default. Those overrides are defined as a dict with (keyword, val) as
 # dictionary key. In case of multiple items, the last override takes precedence.
+
 test_cases = [
     (
         # data
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
@@ -43,6 +43,7 @@
 import pandas._testing as tm
 from pandas.core.arrays import DatetimeArray
 from pandas.core.tools import datetimes as tools
+from pandas.core.tools.datetimes import start_caching_at
 
 
 class TestTimeConversionFormats:
@@ -956,6 +957,19 @@ def test_to_datetime_cache_scalar(self):
         expected = Timestamp("20130101 00:00:00")
         assert result == expected
 
+    def test_convert_object_to_datetime_with_cache(self):
+        # GH#39882 None mixed with NaT in an input long enough to be cached
+        ser = Series(
+            [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")],
+            dtype="object",
+        )
+        result = to_datetime(ser, errors="coerce")
+        expected = Series(
+            [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")],
+            dtype="datetime64[ns]",
+        )
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize(
         "date, format",
         [

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@`
`13`	`13`	`# for most cases), and the specific cases where the result deviates from`
`14`	`14`	`# this default. Those overrides are defined as a dict with (keyword, val) as`
`15`	`15`	`# dictionary key. In case of multiple items, the last override takes precedence.`
	`16`	`+`
`16`	`17`	`test_cases = [`
`17`	`18`	`(`
`18`	`19`	`# data`