Skip to content

Commit 54bd5cd

Browse files
authored
Bug in to_datetime raising ValueError with None and NaT and more than 50 elements (#41006)
1 parent c14712e commit 54bd5cd

File tree

4 files changed

+21
-1
lines changed

4 files changed

+21
-1
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -854,6 +854,7 @@ Reshaping
854854
- Bug in :meth:`DataFrame.stack` not preserving ``CategoricalDtype`` in a ``MultiIndex`` (:issue:`36991`)
855855
- Bug in :func:`to_datetime` raising error when input sequence contains unhashable items (:issue:`39756`)
856856
- Bug in :meth:`Series.explode` preserving index when ``ignore_index`` was ``True`` and values were scalars (:issue:`40487`)
857+
- Bug in :func:`to_datetime` raising ``ValueError`` when :class:`Series` contains ``None`` and ``NaT`` and has more than 50 elements (:issue:`39882`)
857858

858859
Sparse
859860
^^^^^^

pandas/core/tools/datetimes.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@
8484
Scalar = Union[int, float, str]
8585
DatetimeScalar = TypeVar("DatetimeScalar", Scalar, datetime)
8686
DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]
87+
start_caching_at = 50
8788

8889

8990
# ---------------------------------------------------------------------
@@ -130,7 +131,7 @@ def should_cache(
130131
# default realization
131132
if check_count is None:
132133
# in this case, the gain from caching is negligible
133-
if len(arg) <= 50:
134+
if len(arg) <= start_caching_at:
134135
return False
135136

136137
if len(arg) <= 5000:
@@ -193,6 +194,9 @@ def _maybe_cache(
193194
if len(unique_dates) < len(arg):
194195
cache_dates = convert_listlike(unique_dates, format)
195196
cache_array = Series(cache_dates, index=unique_dates)
197+
if not cache_array.is_unique:
198+
# GH#39882: when the input contains both None and NaT, the cache ends up with duplicates, so drop them
199+
cache_array = cache_array.drop_duplicates()
196200
return cache_array
197201

198202

pandas/tests/series/methods/test_convert_dtypes.py

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# for most cases), and the specific cases where the result deviates from
1414
# this default. Those overrides are defined as a dict with (keyword, val) as
1515
# dictionary key. In case of multiple items, the last override takes precedence.
16+
1617
test_cases = [
1718
(
1819
# data

pandas/tests/tools/test_to_datetime.py

+14
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import pandas._testing as tm
4444
from pandas.core.arrays import DatetimeArray
4545
from pandas.core.tools import datetimes as tools
46+
from pandas.core.tools.datetimes import start_caching_at
4647

4748

4849
class TestTimeConversionFormats:
@@ -956,6 +957,19 @@ def test_to_datetime_cache_scalar(self):
956957
expected = Timestamp("20130101 00:00:00")
957958
assert result == expected
958959

960+
def test_convert_object_to_datetime_with_cache(self):
961+
# GH#39882
962+
ser = Series(
963+
[None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")],
964+
dtype="object",
965+
)
966+
result = to_datetime(ser, errors="coerce")
967+
expected = Series(
968+
[NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")],
969+
dtype="datetime64[ns]",
970+
)
971+
tm.assert_series_equal(result, expected)
972+
959973
@pytest.mark.parametrize(
960974
"date, format",
961975
[

0 commit comments

Comments
 (0)