Skip to content

Commit 1781d5a

Browse files
Backport PR #42261 (BUG: Fix to_datetime() cache behaviour to not omit duplicated output values) (#42660)
Co-authored-by: Zheyuan <[email protected]>
1 parent f321369 commit 1781d5a

File tree

3 files changed

+33
-9
lines changed

3 files changed

+33
-9
lines changed

doc/source/whatsnew/v1.3.1.rst

+2
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,8 @@ Fixed regressions
2525
- Fixed regression in :meth:`DataFrame.isin` and :meth:`Series.isin` raising ``TypeError`` with nullable data containing at least one missing value (:issue:`42405`)
2626
- Regression in :func:`concat` between objects with bool dtype and integer dtype casting to object instead of to integer (:issue:`42092`)
2727
- Bug in :class:`Series` constructor not accepting a ``dask.Array`` (:issue:`38645`)
28+
- Fixed regression in :func:`to_datetime` returning ``pd.NaT`` for inputs that produce duplicated values when ``cache=True`` (:issue:`42259`)
29+
2830

2931
.. ---------------------------------------------------------------------------
3032

pandas/core/tools/datetimes.py

+3 -3
Original file line number | Diff line number | Diff line change
@@ -194,9 +194,9 @@ def _maybe_cache(
194194
if len(unique_dates) < len(arg):
195195
cache_dates = convert_listlike(unique_dates, format)
196196
cache_array = Series(cache_dates, index=unique_dates)
197-
if not cache_array.is_unique:
198-
# GH#39882 in case of None and NaT we get duplicates
199-
cache_array = cache_array.drop_duplicates()
197+
# GH#39882 and GH#35888 in case of None and NaT we get duplicates
198+
if not cache_array.index.is_unique:
199+
cache_array = cache_array[~cache_array.index.duplicated()]
200200
return cache_array
201201

202202

pandas/tests/tools/test_to_datetime.py

+28 -6
Original file line number | Diff line number | Diff line change
@@ -957,18 +957,40 @@ def test_to_datetime_cache_scalar(self):
957957
expected = Timestamp("20130101 00:00:00")
958958
assert result == expected
959959

960-
def test_convert_object_to_datetime_with_cache(self):
960+
@pytest.mark.parametrize(
961+
"datetimelikes,expected_values",
962+
(
963+
(
964+
(None, np.nan) + (NaT,) * start_caching_at,
965+
(NaT,) * (start_caching_at + 2),
966+
),
967+
(
968+
(None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at,
969+
(NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at,
970+
),
971+
(
972+
(None,)
973+
+ (NaT,) * start_caching_at
974+
+ ("2012 July 26", Timestamp("2012-07-26")),
975+
(NaT,) * (start_caching_at + 1)
976+
+ (Timestamp("2012-07-26"), Timestamp("2012-07-26")),
977+
),
978+
),
979+
)
980+
def test_convert_object_to_datetime_with_cache(
981+
self, datetimelikes, expected_values
982+
):
961983
# GH#39882
962984
ser = Series(
963-
[None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")],
985+
datetimelikes,
964986
dtype="object",
965987
)
966-
result = to_datetime(ser, errors="coerce")
967-
expected = Series(
968-
[NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")],
988+
result_series = to_datetime(ser, errors="coerce")
989+
expected_series = Series(
990+
expected_values,
969991
dtype="datetime64[ns]",
970992
)
971-
tm.assert_series_equal(result, expected)
993+
tm.assert_series_equal(result_series, expected_series)
972994

973995
@pytest.mark.parametrize(
974996
"date, format",

0 commit comments

Comments
 (0)